## **Evaluation**

In [None]:
from src.training_utils import *
from src.summarizers import MemSum
from tqdm import tqdm
from rouge_score import rouge_scorer
import json
import numpy as np
import torch

In [None]:
# ROUGE-1 / ROUGE-2 / ROUGE-Lsum scorer (with stemming) shared by the
# evaluation code in this notebook.
rouge_cal = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)


# Which checkpoint to evaluate: 'glove', 'one_head' or 'two_heads'.
model_eval = 'one_head'

# NOTE(review): hard-coded to True; the original author left
# torch.cuda.is_available() commented out as an alternative. How MemSum
# interprets this flag is not visible from this notebook.
gpu = True

# Per-mode settings: (pegasus_mode, model_path, embed_dim, two_heads).
_MODEL_CONFIGS = {
    'glove': (
        False,
        "src/model/MemSum_Full/PubMed/memsum/model_batch_1169370.pt",
        200,
        False,
    ),
    'one_head': (
        True,
        "src/model/MemSum_Full/PubMed/one_head/model_batch_1003000.pt",
        768,
        False,
    ),
    'two_heads': (
        True,
        "src/model/MemSum_Full/PubMed/two_heads/model_batch_735000.pt",
        768,
        True,
    ),
}

# Fail immediately on a typo in model_eval instead of hitting a NameError
# further down when the unset variables are first used.
if model_eval not in _MODEL_CONFIGS:
    raise ValueError(
        f"Unknown model_eval {model_eval!r}; "
        f"expected one of {sorted(_MODEL_CONFIGS)}")

pegasus_mode, model_path, embed_dim, two_heads = _MODEL_CONFIGS[model_eval]


# In the single-head modes this stays a path to a JSONL file; in two-heads
# mode it is replaced below by the already-parsed JSON content.
pubmed_test_data = "src/data/PubMed/test_PUBMED.jsonl"

if two_heads:
    with open("src/data/PubMed/Test_ExtAbs_PUBMED.json", encoding="utf-8") as f:
        pubmed_test_data = json.load(f)


memsum_custom_data = MemSum(model_path,
                            "src/model/glove/vocabulary_200dim.pkl",
                            gpu=gpu, max_doc_len=100, pegasus_mode=pegasus_mode,
                            embed_dim=embed_dim, two_heads=two_heads)

In [None]:
# Build the evaluation corpus. In the single-head modes pubmed_test_data is
# a path to a JSONL file (one JSON document per line); in two-heads mode it
# already holds the parsed JSON content and is used as-is.
if not two_heads:
    # Use a context manager so the file handle is closed, and skip blank
    # lines (e.g. a trailing newline) that json.loads would reject.
    with open(pubmed_test_data, encoding="utf-8") as corpus_file:
        test_corpus_custom_data = [
            json.loads(line) for line in corpus_file if line.strip()
        ]
else:
    test_corpus_custom_data = pubmed_test_data

In [None]:
def order_sentences(shuffled_list, ordered_numbers):
    """Return the item of ``shuffled_list`` whose sort key is smallest.

    Items are paired positionally with ``ordered_numbers`` (extra entries in
    the longer input are ignored), the pairs are stably sorted by key, and
    only the item of the first pair is returned. Raises ``IndexError`` when
    either input is empty.

    NOTE(review): despite the name, a single element is returned rather
    than the whole reordered list. ``evaluate`` calls this with the two
    parts of ``model.extract(...)``; presumably those are per-document
    batches, in which case this selects one document's sentence list
    without reordering the sentences inside it -- confirm the intent.
    """
    ranked = sorted(zip(ordered_numbers, shuffled_list), key=lambda pair: pair[0])
    items_in_order = [item for _, item in ranked]
    return items_in_order[0]


def evaluate(model, corpus, p_stop, max_extracted_sentences, rouge_cal,
             max_documents=None, verbose=False):
    """Compute the mean ROUGE F-measures of ``model`` over ``corpus``.

    The branch taken for each example depends on the module-level
    ``two_heads`` flag:

    * ``two_heads`` false: each example is a mapping; ``data["summary"]`` is
      the reference and ``[data["text"]]`` is passed to ``model.extract``.
    * ``two_heads`` true: ``data[1]["summary"]`` is the reference and the
      whole example is passed to ``model.extract``.

    The two parts of the ``extract`` result are handed to
    ``order_sentences``; its return value and the reference are each joined
    with newlines before scoring.

    Args:
        model: Object exposing ``extract(inputs, p_stop_thres=...,
            return_sentence_position=..., max_extracted_sentences_per_document=...)``.
        corpus: Iterable of examples shaped as described above.
        p_stop: Forwarded to ``extract`` as ``p_stop_thres``.
        max_extracted_sentences: Forwarded to ``extract`` as
            ``max_extracted_sentences_per_document``.
        rouge_cal: Scorer whose ``score(target, prediction)`` result has
            ``rouge1``, ``rouge2`` and ``rougeLsum`` entries with an
            ``fmeasure`` attribute.
        max_documents: Optional cap on the number of examples scored, for
            quick smoke tests. ``None`` (default) scores the whole corpus.
        verbose: When true, print every extracted summary.

    Returns:
        Array ``[rouge1, rouge2, rougeLsum]`` of F-measures averaged over the
        scored examples. If no example is scored the mean of an empty array
        is returned (NaN, with a NumPy warning).
    """
    scores = []
    for doc_index, data in enumerate(tqdm(corpus)):
        # Explicit, opt-in replacement for the debugging `break` that used
        # to stop unconditionally after the first document.
        if max_documents is not None and doc_index >= max_documents:
            break

        if not two_heads:
            gold_summary = data["summary"]
            model_input = [data["text"]]
        else:
            gold_summary = data[1]["summary"]
            model_input = data

        extracted_summary = model.extract(
            model_input,
            p_stop_thres=p_stop,
            return_sentence_position=True,
            max_extracted_sentences_per_document=max_extracted_sentences,
        )
        extracted_summary = order_sentences(extracted_summary[0], extracted_summary[1])

        if verbose:
            print("Extracted Summary: ", extracted_summary)

        # rouge_score expects (target, prediction) in that order.
        score = rouge_cal.score("\n".join(gold_summary), "\n".join(extracted_summary))
        scores.append([
            score["rouge1"].fmeasure,
            score["rouge2"].fmeasure,
            score["rougeLsum"].fmeasure,
        ])
    return np.asarray(scores).mean(axis=0)

In [None]:
# Score the loaded model on the test corpus: stop threshold 0.6, at most
# 7 extracted sentences per document.
evaluate(
    model=memsum_custom_data,
    corpus=test_corpus_custom_data,
    p_stop=0.6,
    max_extracted_sentences=7,
    rouge_cal=rouge_cal,
)

### Code to plot the validation results stored in WandB

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Hard-coded ROUGE values, one list per metric and model variant (per the
# section heading, these are validation results taken from WandB).
rouge1_memsum = [
    0.4672, 0.4672, 0.4734, 0.4734, 0.4773, 0.4780, 0.4780, 0.4801,
    0.4802, 0.4799, 0.4811, 0.4810, 0.4821, 0.4824, 0.4819,
]
rouge2_memsum = [
    0.2126, 0.2125, 0.2167, 0.2166, 0.2191, 0.2199, 0.2198, 0.2217,
    0.2218, 0.2213, 0.2224, 0.2230, 0.2236, 0.2236, 0.2233,
]
rougeL_memsum = [
    0.4260, 0.4260, 0.4309, 0.4309, 0.4349, 0.4345, 0.4345, 0.4370,
    0.4371, 0.4361, 0.4377, 0.4376, 0.4390, 0.4384, 0.4374,
]

rouge1_transformer = [
    0.4727, 0.4759, 0.4800, 0.4800, 0.4786, 0.4787,
    0.4807, 0.4813, 0.4794, 0.4831, 0.4829, 0.4834,
]
rouge2_transformer = [
    0.2177, 0.2219, 0.2246, 0.2246, 0.2245, 0.2245,
    0.2241, 0.2258, 0.2245, 0.2271, 0.2266, 0.2262,
]
rougeL_transformer = [
    0.4332, 0.4361, 0.4396, 0.4395, 0.4386, 0.4387,
    0.4401, 0.4414, 0.4400, 0.4427, 0.4421, 0.4417,
]

rouge1_transformer_cross = [
    0.4627, 0.4687, 0.4619, 0.4707, 0.4455, 0.4405, 0.4447,
    0.4448, 0.4403, 0.4376, 0.4354, 0.4379, 0.4367,
]
# NOTE(review): this series has 12 values while the ROUGE-1 and ROUGE-L
# series for the same model have 13 -- a value may be missing; confirm
# against the logged run.
rouge2_transformer_cross = [
    0.2069, 0.2110, 0.2082, 0.2136, 0.1936, 0.192,
    0.194, 0.1932, 0.1909, 0.1880, 0.1865, 0.1885,
]
rougeL_transformer_cross = [
    0.4233, 0.4292, 0.4204, 0.4306, 0.4077, 0.4002, 0.4042,
    0.4054, 0.3997, 0.3970, 0.3934, 0.3967, 0.3952,
]

# Legend entries, in the same order as the curves within each panel.
legend_labels = ['MS Vanilla', 'MS Transformer', 'MS Transformer+ Cross Attn']

# One (title, curves) entry per subplot, left to right.
panels = [
    ("ROUGE-1", [rouge1_memsum, rouge1_transformer, rouge1_transformer_cross]),
    ("ROUGE-2", [rouge2_memsum, rouge2_transformer, rouge2_transformer_cross]),
    ("ROUGE-L", [rougeL_memsum, rougeL_transformer, rougeL_transformer_cross]),
]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 10))  # Adjust the figsize as needed

for ax, (title, curves) in zip(axes, panels):
    sns.lineplot(data=curves, ax=ax)
    ax.set_title(title)
    ax.legend(legend_labels)

# Tighten the layout, write the figure to disk, then display it.
plt.tight_layout()
plt.savefig("rouge_results.pdf")
plt.show()
