# Evaluating Task 3: state of the art or summary of several papers

In [3]:
import numpy as np
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from langchain_community.llms import Ollama
import re
import json

### Load summaries and original

In [4]:
paper_id = "2402.01383v1"
def get_json(file_name):
    # Step 1: Read the JSON file
    with open(file_name + '.json', 'r') as file:
        json_data = json.load(file)
    return json_data
original = get_json('dataset/'+paper_id+'data')
original_text = original['fulltext']
original_text

'LLM-based NLG Evaluation: Current Status and Challenges\nMingqi Gao , Xinyu Hu , Jie Ruan , Xiao Pu ,\nXiaojun Wan\nPeking University\n{gaomingqi, huxinyu, wanxiaojun}@pku.edu.cn, {ruanjie, puxiao}@stu.pku.edu.cn\nAbstract\nEvaluating natural language generation (NLG) is\na vital but challenging problem in artificial intel-\nligence. Traditional evaluation metrics mainly cap-\nturing content (e.g. n-gram) overlap between sys-\ntem outputs and references are far from satisfactory,\nand large language models (LLMs) such as Chat-\nGPT have demonstrated great potential in NLG\nevaluation in recent years. Various automatic evalu-\nation methods based on LLMs have been proposed,\nincluding metrics derived from LLMs, prompting\nLLMs, and fine-tuning LLMs with labeled evalu-\nation data. In this survey, we first give a taxon-\nomy of LLM-based NLG evaluation methods, and\ndiscuss their pros and cons, respectively. We also\ndiscuss human-LLM collaboration for NLG evalu-\nation. Lastly, we disc

In [5]:
def get_summaries(filenames,p_id):
    summaries_list = []
    for name in filenames:
        with open(f'summaries/'+p_id+'/'+name+'.txt', 'r') as file:
            i = file.read()
        summaries_list.append(i)
    return summaries_list

In [6]:
filenames = ['simple_summary'+str(i) for i in range(1,11)]
s = get_summaries(filenames,paper_id)

In [7]:
len(s)

10

In [8]:
scorer_bert = BERTScorer(model_type='bert-base-uncased')
def get_bert(candidate, reference,filename):
    scores = []
    if len(candidate)<100:
        num = len(candidate)
    else:
        num = 100
    for i in range(num):
        P, R, F1 = scorer_bert.score([candidate[i]], [reference])
        scores.append([round(float(P[0]),4),round(float(R[0]),4),round(float(F1[0]),4)])
    
    m = np.mean(scores,axis=0)
    s = np.std(scores,axis=0)
    scores.append(m)
    scores.append(s) 
    np.savetxt('results/'+filename+'_bert.txt',np.matrix(scores),fmt='%.2f')
    return np.matrix(scores)
scores = get_bert(s,original_text,'non_rag')



In [14]:
scores

matrix([[0.5779    , 0.5175    , 0.546     ],
        [0.5903    , 0.5404    , 0.5643    ],
        [0.5724    , 0.5123    , 0.5407    ],
        [0.6048    , 0.5398    , 0.5705    ],
        [0.5865    , 0.5364    , 0.5603    ],
        [0.5906    , 0.5548    , 0.5721    ],
        [0.5863    , 0.5461    , 0.5655    ],
        [0.6037    , 0.5478    , 0.5744    ],
        [0.5877    , 0.5573    , 0.5721    ],
        [0.5981    , 0.4856    , 0.536     ],
        [0.58983   , 0.5338    , 0.56019   ],
        [0.00979745, 0.02116138, 0.01342903]])

In [18]:
c = get_summaries(['complex_summary'+str(i) for i in range(1,11)],paper_id)
get_bert(c,original_text,'complex')

matrix([[0.5972    , 0.5774    , 0.5871    ],
        [0.617     , 0.5872    , 0.6017    ],
        [0.6006    , 0.5926    , 0.5965    ],
        [0.6221    , 0.5859    , 0.6035    ],
        [0.6195    , 0.6089    , 0.6142    ],
        [0.6242    , 0.6048    , 0.6143    ],
        [0.6179    , 0.5985    , 0.608     ],
        [0.6135    , 0.576     , 0.5941    ],
        [0.6304    , 0.6089    , 0.6194    ],
        [0.6163    , 0.5921    , 0.6039    ],
        [0.61587   , 0.59323   , 0.60427   ],
        [0.00961998, 0.01136645, 0.0095288 ]])

In [24]:
rags = get_summaries(['rag5','rag8','rag10'],paper_id)
rags

["Response:  Recent developments in Large Language Models (LLMs) have significantly impacted the field of Natural Language Generation (NLG), leading to new research directions and challenges in evaluating NLG systems using LLMs. In this response, we will discuss the current status and challenges of LLM-based NLG evaluation.\n\nFirstly, it is essential to understand that LLMs have shown remarkable performance on various NLG evaluation tasks (Liu et al., 2021; Raffel et al., 2019). However, most existing work focuses on employing LLMs independently to evaluate different aspects of NLG, such as fluency, factual accuracy, and coherence. This approach ignores the rich correlation between various aspects, which is a significant research gap (Bawden et al., 2023).\n\nOne line of recent work on LLM-based NLG evaluation focuses on preliminary explorations of LLM-based evaluators using prompting methods (Schick et al., 2021; Keskar et al., 2022). These studies aim to leverage the instruction-fol

In [25]:
get_bert(rags,original_text,'rag_results')

matrix([[0.6775    , 0.614     , 0.6442    ],
        [0.6733    , 0.6208    , 0.646     ],
        [0.6619    , 0.6166    , 0.6385    ],
        [0.6709    , 0.61713333, 0.6429    ],
        [0.0065909 , 0.00280159, 0.00319687]])

In [27]:
rags_long = get_summaries(['rag5_large_data','rag8_large_data','rag10_large_data'],paper_id)
rags_long
get_bert(rags_long,original_text,'rag_long_results')

matrix([[0.6526    , 0.6058    , 0.6284    ],
        [0.6503    , 0.61      , 0.6295    ],
        [0.6609    , 0.6197    , 0.6396    ],
        [0.6546    , 0.61183333, 0.6325    ],
        [0.00455265, 0.00582084, 0.0050405 ]])

In [25]:
#Mean over all rag texts
np.mean([[0.6709    , 0.61713333, 0.6429    ],[0.6546    , 0.61183333, 0.6325    ]],axis=0)

array([0.66275   , 0.61448333, 0.6377    ])

In [17]:
full_papers =  get_summaries(['fulltext_summary'],paper_id)
get_bert(full_papers,original_text,'full')

matrix([[0.5063, 0.4332, 0.4669],
        [0.5063, 0.4332, 0.4669],
        [0.    , 0.    , 0.    ]])

### Over all paper ids

In [21]:
ids = ["2402.01383v1","2409.09957","2409.15180","2409.15816","2408.02085","2311.13731","2311.12785"]
#ids = ["2402.01383v1","2409.09957","2409.15180","2409.15816","2408.02085","2311.13731","2402.06196","2408.02304"]
name = 'rag10_large_data'
summaries = [get_summaries([name],id)[0] for id in ids]
originals = [get_json('dataset/'+id+'data')['fulltext'] for id in ids]
originals

['LLM-based NLG Evaluation: Current Status and Challenges\nMingqi Gao , Xinyu Hu , Jie Ruan , Xiao Pu ,\nXiaojun Wan\nPeking University\n{gaomingqi, huxinyu, wanxiaojun}@pku.edu.cn, {ruanjie, puxiao}@stu.pku.edu.cn\nAbstract\nEvaluating natural language generation (NLG) is\na vital but challenging problem in artificial intel-\nligence. Traditional evaluation metrics mainly cap-\nturing content (e.g. n-gram) overlap between sys-\ntem outputs and references are far from satisfactory,\nand large language models (LLMs) such as Chat-\nGPT have demonstrated great potential in NLG\nevaluation in recent years. Various automatic evalu-\nation methods based on LLMs have been proposed,\nincluding metrics derived from LLMs, prompting\nLLMs, and fine-tuning LLMs with labeled evalu-\nation data. In this survey, we first give a taxon-\nomy of LLM-based NLG evaluation methods, and\ndiscuss their pros and cons, respectively. We also\ndiscuss human-LLM collaboration for NLG evalu-\nation. Lastly, we dis

In [22]:
def get_bert_all(candidate, reference,filename):
    scores = []
    if len(candidate)<100:
        num = len(candidate)
    else:
        num = 100
    for i in range(num):
        P, R, F1 = scorer_bert.score([candidate[i]], [reference[i]])
        scores.append([round(float(P[0]),4),round(float(R[0]),4),round(float(F1[0]),4)])
    
    m = np.mean(scores,axis=0)
    s = np.std(scores,axis=0)
    scores.append(m)
    scores.append(s) 
    np.savetxt('results/'+filename+'_bert.txt',np.matrix(scores),fmt='%.2f')
    return np.matrix(scores)

In [23]:
get_bert_all(summaries, originals,name)

matrix([[0.6609    , 0.6197    , 0.6396    ],
        [0.5958    , 0.5836    , 0.5896    ],
        [0.6509    , 0.6229    , 0.6366    ],
        [0.6348    , 0.5999    , 0.6168    ],
        [0.6145    , 0.5857    , 0.5998    ],
        [0.6373    , 0.607     , 0.6218    ],
        [0.5914    , 0.5679    , 0.5794    ],
        [0.62651429, 0.5981    , 0.61194286],
        [0.02473502, 0.01867045, 0.02137868]])

In [41]:
ids

['2402.01383v1', '2409.09957', '2409.15180', '2409.15816']

### Evaluate length of texts

0.06044055943230335

### Text of RAGAS

In [30]:
from datasets import Dataset 
from ragas.metrics import FaithulnesswithHHEM
from ragas import evaluate




TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases

In [None]:
faithfulness_with_hhem = FaithulnesswithHHEM()
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness_with_hhem])
score.to_pandas()