# Evaluating Task 3: state of the art or summary of several papers

In [1]:
import numpy as np
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from langchain_community.llms import Ollama
import re
import json

  from .autonotebook import tqdm as notebook_tqdm


### Load summaries and original

In [2]:
paper_id = "2402.01383v1"
def get_json(file_name):
    # Step 1: Read the JSON file
    with open(file_name + '.json', 'r') as file:
        json_data = json.load(file)
    return json_data
original = get_json('dataset/'+paper_id+'data')
original_text = original['fulltext']
original_text

'LLM-based NLG Evaluation: Current Status and Challenges\nMingqi Gao , Xinyu Hu , Jie Ruan , Xiao Pu ,\nXiaojun Wan\nPeking University\n{gaomingqi, huxinyu, wanxiaojun}@pku.edu.cn, {ruanjie, puxiao}@stu.pku.edu.cn\nAbstract\nEvaluating natural language generation (NLG) is\na vital but challenging problem in artificial intel-\nligence. Traditional evaluation metrics mainly cap-\nturing content (e.g. n-gram) overlap between sys-\ntem outputs and references are far from satisfactory,\nand large language models (LLMs) such as Chat-\nGPT have demonstrated great potential in NLG\nevaluation in recent years. Various automatic evalu-\nation methods based on LLMs have been proposed,\nincluding metrics derived from LLMs, prompting\nLLMs, and fine-tuning LLMs with labeled evalu-\nation data. In this survey, we first give a taxon-\nomy of LLM-based NLG evaluation methods, and\ndiscuss their pros and cons, respectively. We also\ndiscuss human-LLM collaboration for NLG evalu-\nation. Lastly, we disc

In [3]:
def get_summaries(filenames):
    summaries_list = []
    for name in filenames:
        with open(f'summaries/'+paper_id+'/'+name+'.txt', 'r') as file:
            i = file.read()
        summaries_list.append(i)
    return summaries_list

In [5]:
filenames = ['simple_summary'+str(i) for i in range(1,11)]
s = get_summaries(filenames)

In [6]:
len(s)

10

In [13]:
scorer_bert = BERTScorer(model_type='bert-base-uncased')
def get_bert(candidate, reference,filename):
    scores = []
    if len(candidate)<100:
        num = len(candidate)
    else:
        num = 100
    for i in range(num):
        P, R, F1 = scorer_bert.score([candidate[i]], [reference])
        scores.append([round(float(P[0]),4),round(float(R[0]),4),round(float(F1[0]),4)])
    
    m = np.mean(scores,axis=0)
    s = np.std(scores,axis=0)
    scores.append(m)
    scores.append(s) 
    np.savetxt('results/'+filename+'_bert.txt',np.matrix(scores),fmt='%.2f')
    return np.matrix(scores)
scores = get_bert(s,original_text,'non_rag')



In [14]:
scores

matrix([[0.5779    , 0.5175    , 0.546     ],
        [0.5903    , 0.5404    , 0.5643    ],
        [0.5724    , 0.5123    , 0.5407    ],
        [0.6048    , 0.5398    , 0.5705    ],
        [0.5865    , 0.5364    , 0.5603    ],
        [0.5906    , 0.5548    , 0.5721    ],
        [0.5863    , 0.5461    , 0.5655    ],
        [0.6037    , 0.5478    , 0.5744    ],
        [0.5877    , 0.5573    , 0.5721    ],
        [0.5981    , 0.4856    , 0.536     ],
        [0.58983   , 0.5338    , 0.56019   ],
        [0.00979745, 0.02116138, 0.01342903]])

In [12]:
c = get_summaries(['complex_summary'])
get_bert(c,original_text,'complex')

matrix([[0.5972, 0.5774, 0.5871]])

In [9]:
rags = get_summaries(['rag10','rag8','rag5'])
rags

["Response:  Recent developments in Large Language Models (LLMs) have significantly impacted the field of Natural Language Generation (NLG), leading to new research directions and challenges in NLG evaluation. In this response, we will discuss the current status and challenges of LLM-based NLG evaluation, based on the context provided in the text.\n\nFirstly, it is essential to understand that NLG evaluation is a long-standing task in Natural Language Processing (NLP), which has become more challenging with the rapid development of LLMs (Celikyilmaz et al., 2020; Chang et al., 2023). Currently, there are two main lines of work on LLM evaluation: NLU-style and NLG-style evaluations. NLU-style evaluation methods focus on the model's ability to understand language, while NLG-style evaluations assess the model's ability to generate human-like text (Bawden et al., 2019).\n\nRecent research works have focused on LLM-based evaluators due to their promising instruction-following and generaliza

In [10]:
get_bert(rags,original_text,'rag_results')

matrix([[0.6619, 0.6166, 0.6385],
        [0.6733, 0.6208, 0.646 ],
        [0.6775, 0.614 , 0.6442]])

### Text of RAGAS

In [30]:
from datasets import Dataset 
from ragas.metrics import FaithulnesswithHHEM
from ragas import evaluate




TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases

In [None]:
faithfulness_with_hhem = FaithulnesswithHHEM()
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness_with_hhem])
score.to_pandas()