In [3]:
from langchain_community.llms import Ollama
import json
import random
import re
import os

# Combine several papers

In [5]:

# ID of the original state-of-the-art paper. It prefixes the dataset file names
# read below and names the output folder under summaries/.
paper_id = "2402.01383v1"


In [17]:
def get_json(file_name):
    """Load and return the parsed contents of ``<file_name>.json``.

    Args:
        file_name: Path of the JSON file *without* the ``.json`` extension.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``<file_name>.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_name + '.json', 'r', encoding='utf-8') as file:
        return json.load(file)
def write_json(dict_data,file_name):
    """Write ``dict_data`` as indented JSON to ``summaries/<file_name>.json``.

    Args:
        dict_data: JSON-serializable object to store.
        file_name: Target name relative to ``summaries/`` *without* the
            ``.json`` extension; it may contain sub-folders.

    Raises:
        TypeError: If ``dict_data`` is not JSON-serializable.
    """
    path = 'summaries/' + file_name + '.json'
    # Create summaries/ (and any sub-folder in file_name) so the write cannot
    # fail just because the folder is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(dict_data, file, indent=4)



#store the summaries
os.makedirs('summaries/'+paper_id,exist_ok=True)
def store_summary(content,filename):
    """Write ``content`` to ``summaries/<paper_id>/<filename>``.

    Args:
        content: Text to store (a string).
        filename: File name inside the paper's summary folder, including
            its extension.
    """
    path = 'summaries/'+paper_id
    # paper_id is read at call time, so make sure the folder for the *current*
    # paper exists even if it changed or was removed after this cell ran.
    os.makedirs(path, exist_ok=True)
    # Write UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    with open(path+'/'+filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [7]:
# Load the data file of the original paper; its title is used as the survey topic.
original = get_json(f'dataset/{paper_id}data')
topic = original['title']
topic

'LLM-based NLG Evaluation: Current Status and Challenges'

In [8]:
# Load the full texts of the related papers.
data = get_json(f'dataset/{paper_id}full_texts')

In [9]:
# Mistral model through the Ollama wrapper, with temperature 0.
llm = Ollama(temperature=0, model="mistral")

In [10]:
def summarize_n(papers, seed=None):
    """Ask the LLM for one summary of all ``papers`` in a single prompt.

    Args:
        papers: Mapping of paper key -> paper text (both strings; they are
            concatenated as ``"<key>:  <text>"``).
        seed: Optional seed for the shuffle. ``None`` (default) shuffles with
            the global ``random`` state as before; any other value makes the
            paper order reproducible.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    entries = [key+':  '+value for key,value in papers.items()]
    # Randomize the order in which the papers appear in the prompt.
    if seed is None:
        random.shuffle(entries)
    else:
        random.Random(seed).shuffle(entries)
    # Separate with real newlines; the previous '/n' was inserted as literal text.
    context = '\n\n'.join(entries)
    instruction = 'summarize the following papers:\n'+context+'\n summarize them.'
    return llm.invoke(instruction)

In [11]:
longsummary = summarize_n(data)
longsummary
# Complete failure: the returned text (see the output) is not a summary of the papers.

' The first answer about defensive strategies in basketball is rated as having a score of 3 out of 5. The accuracy of the answer is assessed as 3, indicating that it correctly identifies the World Health Organization (WHO) as the primary organization responsible for global health policies and guidelines. However, the coherence, factuality, and comprehensiveness scores are lower due to some errors in the description of defensive strategies and a lack of detail about the process by which WHO develops and implements these policies.\n\nThe second answer regarding attachment styles and romantic relationships is rated as having a score of 4 out of 5. The accuracy of the answer is assessed as 3, indicating that it correctly describes how attachment style influences romantic relationships. However, the coherence, factuality, and comprehensiveness scores are higher due to the clear and detailed explanation of each attachment style and its impact on relationships.\n\nThe third question asks abou

In [13]:
# Save the full-text attempt to summaries/<paper_id>/fulltext_summary.txt.
store_summary(longsummary,'fulltext_summary.txt')

In [14]:
# Is the problem that the context is too big?
l = [key+':  '+value for key,value in data.items()]
# Separate the papers with real blank lines (the previous '/n/n' was literal text).
context = '\n\n'.join(l)
# A bare `len(context)` in the middle of a cell is never displayed, so print it.
print(len(context), 'characters')  # about 3.9 million characters
len(context.split(' '))  # 499'106 words, but only 128'000 are allowed in theory. The llm was trained with 8'000 words in the prompt.
# Yes: the prompt is far too long.

499106

### Try with only abstracts. Is it short enough?

In [18]:
# Load the reference entries of the related papers (each entry has a 'title' and an
# 'abstract'). This rebinds `data`, which held the full texts until now.
data = get_json('dataset/'+paper_id+'ref')

In [19]:
l = [value['title']+':  '+value['abstract'] for value in data.values()]
# Separate the abstracts with real blank lines (the previous '/n/n' was literal text).
context = '\n\n'.join(l)
len(context.split(' '))
# 7956 words is short enough.

7956

In [20]:
def write_summary(summaries):
    """Ask the LLM for a combined summary of all titles and abstracts.

    Args:
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    entries = []
    for paper in summaries.values():
        entries.append(paper['title'] + ':  ' + paper['abstract'])
    texts = '\n'.join(entries)
    prompt = f'Write a summary combining the key findings of following texts: \n {texts}'
    return llm.invoke(prompt)

In [21]:
# Summarize from titles and abstracts only (`data` now holds the reference entries).
simple_summary = write_summary(data)
simple_summary

' This text discusses several research papers related to evaluating and using large language models (LLMs) as judges for various tasks. The first paper explores the use of wider and deeper networks for fairer LLM evaluations, drawing inspiration from deep neural networks and academic paper reviewing. The second paper proposes using strong LLMs as judges for evaluating other LLMs on more open-ended questions and introduces two benchmarks: MT-bench and Chatbot Arena. The third paper proposes fine-tuning LLMs as scalable judges (JudgeLM) to evaluate LLMs efficiently and effectively in open-ended benchmarks.\n\nThe first paper, "Wider and Deeper LLM Networks are Fairer LLM Evaluators," discusses the challenges of evaluating the quality of responses generated by LLMs and proposes a method using an LLM itself to make evaluations through multiple independent evaluations. The authors explore whether deeper and wider networks can lead to fairer evaluations, inspired by the observation that diff

In [22]:
# Save the abstract-based summary to summaries/<paper_id>/simple_summary.txt.
store_summary(simple_summary,'simple_summary.txt')

In [23]:
# List the titles in the order they were put into the prompt.
print('\n'.join(ref['title'] for ref in data.values()))
# The summary starts with: The first paper, "Wider and Deeper LLM Networks are Fairer LLM Evaluators,"...
# => it only considered the last three papers.

Concrete Problems in AI Safety
PaLM 2 Technical Report
Benchmarking Foundation Models with Language-Model-as-an-Examiner
ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate
BooookScore: A systematic exploration of book-length summarization in the era of LLMs
Scaling Instruction-Finetuned Language Models
RAGAS: Automated Evaluation of Retrieval Augmented Generation
GPTScore: Evaluate as You Desire
Are Large Language Models Reliable Judges? A Study on the Factuality Evaluation Capabilities of LLMs
CoAScore: Chain-of-Aspects Prompting for NLG Evaluation
ALLURE: Auditing and Improving LLM-based Evaluation of Text using Iterative In-Context-Learning
SocREval: Large Language Models with the Socratic Method for Reference-Free Reasoning Evaluation
Exploring ChatGPT's Ability to Rank Content: A Preliminary Study on Consistency with Human Preferences
TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks
CritiqueLLM: Towards an Informative Critique Gen

In [24]:
def write_summary_with_topic(summaries,topic):
    """Ask the LLM for an overview of ``topic`` based on titles and abstracts.

    Args:
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.
        topic: Topic text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    entries = []
    for paper in summaries.values():
        entries.append(paper['title'] + ':  ' + paper['abstract'])
    texts = '\n'.join(entries)
    prompt = f'Use this {texts} to create an overview over the topic {topic}.'
    return llm.invoke(prompt)

In [25]:
# Same input as before, but the prompt now names the topic; the result is also
# saved to summaries/<paper_id>/simple_overview.txt.
simple_overview = write_summary_with_topic(data,topic)
store_summary(simple_overview,'simple_overview.txt')
simple_overview

' Title: LLM-based NLG Evaluation: Current Status and Challenges - Wider and Deeper Networks for Fairer Evaluators, LLM-as-a-Judge, and JudgeLM\n\nThis topic covers recent research on evaluating Large Language Models (LLMs) in Natural Language Generation (NLG) tasks. The studies explore various approaches to improve the evaluation process, including wider and deeper networks for fairer evaluators, using LLMs as judges, and fine-tuning LLMs as scalable judges.\n\n1. Wider and Deeper LLM Networks are Fairer LLM Evaluators:\nThis paper proposes a novel approach to use the LLM itself to make evaluations and improve fairness by designing a network that resembles academic paper reviewing. The network consists of multiple layers, with each layer receiving representations from all neurons in the previous layer, integrating locally learned evaluation information for more comprehensive results. Experimental results demonstrate that a wider network (involving many reviewers) with 2 layers perform

In [26]:
# This also only considers the last 3 papers. Is that because they come last in the prompt, or because the LLM considers them the most relevant?

In [33]:
def write_summary_with_shuffle(summaries,topic,seed=None):
    """Like ``write_summary_with_topic`` but with the papers in random order.

    Prints the (up to) three entries that end up last in the prompt so they
    can be compared with the papers the LLM mentions in its answer.

    Args:
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.
        topic: Topic text inserted into the prompt.
        seed: Optional seed for the shuffle. ``None`` (default) shuffles with
            the global ``random`` state as before; any other value makes the
            paper order reproducible.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    entries = [s['title']+':  '+s['abstract'] for s in summaries.values()]
    if seed is None:
        random.shuffle(entries)
    else:
        random.Random(seed).shuffle(entries)
    # Slice instead of indexing [-3], [-2], [-1] so fewer than three papers
    # no longer raises IndexError; with three or more the output is unchanged.
    tail = entries[-3:]
    print('last 3 papers:', *tail[:1])
    for entry in tail[1:]:
        print(entry)
    texts = '\n'.join(entries)
    instruction = f'Use this {texts} to create an overview over the topic {topic}.'
    return llm.invoke(instruction)

In [34]:
# Repeat the overview with shuffled paper order; the result is also saved to
# summaries/<paper_id>/simple_overview_shuffled.txt.
simple_overview_s = write_summary_with_shuffle(data,topic)
store_summary(simple_overview_s,'simple_overview_shuffled.txt')
simple_overview_s

last 3 papers: Branch-Solve-Merge Improves Large Language Model Evaluation and Generation:  Large Language Models (LLMs) are frequently used for multi-faceted language
generation and evaluation tasks that involve satisfying intricate user
constraints or taking into account multiple aspects and criteria. However,
their performance can fall short, due to the model's lack of coherence and
inability to plan and decompose the problem. We propose Branch-Solve-Merge
(BSM), a Large Language Model program (Schlag et al., 2023) for tackling such
challenging natural language tasks. It consists of branch, solve, and merge
modules that are parameterized with specific prompts to the base LLM. These
three modules plan a decomposition of the task into multiple parallel
sub-tasks, independently solve them, and fuse the solutions to the sub-tasks.
We apply our method to the tasks of LLM response evaluation and constrained
text generation and evaluate its effectiveness with multiple LLMs, including
Vicun

" Title: Recent Advances in Large Language Model Evaluation: TIGERScore, EvalLM, Branch-Solve-Merge, and FLASK\n\nIn recent years, there has been a surge of interest in evaluating the performance of large language models (LLMs) in natural language generation (NLG) tasks. In this summary, we will discuss four recent studies that have made significant strides in developing new methods for evaluating LLMs: TIGERScore, EvalLM, Branch-Solve-Merge, and FLASK.\n\n1. TIGERScore: A Universal Explainable Metric for NLG Tasks\nTIGERScore is a reference-free metric that evaluates the rationality of explanations generated by LLMs. The study demonstrates that TIGERScore's correlation with human ratings is high and approaches that of GPT-4 evaluators. Human evaluation of the generated explanations showed an accuracy of 70.8%. The researchers believe that TIGERScore shows the potential for building universal explainable metrics to evaluate any NLG task.\n\n2. EvalLM: Interactive System for Iteratively

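### Write in several steps
First create an outline, then draft the introduction, the literature review, the challenges and the conclusion, and finally combine them.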

In [37]:
def outline(summaries,topic):
    """Ask the LLM for a survey outline, restricted to the given papers.

    NOTE: a later cell defines another ``outline`` that takes only ``topic``;
    once that cell has run, this version is no longer reachable by name.

    Args:
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.
        topic: Topic text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    info = '\n'.join(paper['title'] + ':  ' + paper['abstract'] for paper in summaries.values())
    context_part = f'Use following context only {info}. '
    # The second line keeps its four leading spaces so the prompt text is unchanged.
    request_part = (
        f'I am creating a state-of-the-art survey on {topic}.The survey should cover trends, methodologies, key findings, and future directions. \n'
        '    Could you help outline a structure for this paper?'
    )
    return llm.invoke(context_part + request_part)

In [38]:
# Keep the result under its own name: assigning it to `outline` would replace the
# function with a string, so this cell could not be run a second time.
outline_with_context = outline(data, topic)
outline_with_context

" Title: State-of-the-Art Survey on LLM-Based NLG Evaluation: Trends, Methodologies, Key Findings, and Future Directions\n\n1. Introduction\n* Brief overview of large language models (LLMs) and natural language generation (NLG)\n* Importance of evaluating LLMs in open-ended scenarios\n* Motivation for the survey: current challenges and limitations of existing benchmarks and metrics\n\n2. Background and Related Work\n* Overview of LLMs and their applications in NLG\n* Existing benchmarks and metrics for evaluating LLMs in NLG tasks\n* Previous surveys on NLG evaluation and their contributions\n\n3. Trends in LLM-Based NLG Evaluation\n* Recent advances in designing and implementing LLMs for NLG tasks\n* Emerging applications of LLMs in NLG evaluation, such as adversarial perturbations, model ensembles, and fairness\n\n4. Methodologies for LLM-Based NLG Evaluation\n* Approaches to evaluating LLMs in open-ended scenarios, including human evaluation, automatic metrics, and hybrid methods\n*

In [39]:
def outline(topic):
    """Ask the LLM for a survey outline using only the topic (no paper context).

    NOTE: this redefines ``outline`` from the earlier cell, which also took
    the paper summaries.

    Args:
        topic: Topic text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    # The second line keeps its four leading spaces so the prompt text is unchanged.
    prompt = (
        f'I am creating a state-of-the-art survey on {topic}.The survey should cover trends, methodologies, key findings, and future directions. \n'
        '    Could you help outline a structure for this paper?'
    )
    return llm.invoke(prompt)

In [40]:
# Outline generated from the topic alone; the later drafting steps use this one.
outline2 =outline(topic)
outline2

' Title: LLM-Based NLG Evaluation: Current Status and Challenges\n\n1. Introduction\n   * Brief overview of Natural Language Generation (NLG) and its importance in AI and HCI\n   * Explanation of the role of Large Language Models (LLMs) in NLG and their recent advancements\n   * Objective of the survey: to discuss trends, methodologies, key findings, and future directions in LLM-based NLG evaluation\n\n2. Trends in LLM-Based NLG Evaluation\n   * Description of the growing interest in evaluating LLM-based NLG systems\n   * Overview of popular applications and industries where LLM-based NLG is being used (e.g., customer service, content generation, education)\n   * Discussion on the increasing availability of large datasets for training and evaluating LLMs\n\n3. Methodologies for Evaluating LLM-Based NLG Systems\n   * Description of various evaluation metrics and techniques used in NLG research, focusing on those specifically designed for LLM-based systems (e.g., BLEU, ROUGE, PER, Human 

In [41]:
# print() renders the line breaks instead of showing the escaped repr.
print(outline2)

 Title: LLM-Based NLG Evaluation: Current Status and Challenges

1. Introduction
   * Brief overview of Natural Language Generation (NLG) and its importance in AI and HCI
   * Explanation of the role of Large Language Models (LLMs) in NLG and their recent advancements
   * Objective of the survey: to discuss trends, methodologies, key findings, and future directions in LLM-based NLG evaluation

2. Trends in LLM-Based NLG Evaluation
   * Description of the growing interest in evaluating LLM-based NLG systems
   * Overview of popular applications and industries where LLM-based NLG is being used (e.g., customer service, content generation, education)
   * Discussion on the increasing availability of large datasets for training and evaluating LLMs

3. Methodologies for Evaluating LLM-Based NLG Systems
   * Description of various evaluation metrics and techniques used in NLG research, focusing on those specifically designed for LLM-based systems (e.g., BLEU, ROUGE, PER, Human Evaluation)
  

In [42]:
def intro(topic,out):
    """Ask the LLM to draft the survey introduction from an existing outline.

    Args:
        topic: Topic text inserted into the prompt.
        out: Outline text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    # Continuation lines keep their four leading spaces so the prompt text is unchanged.
    prompt = (
        f'Here is the outline for a state-of-the-art survey on {topic} that I created earlier: {out}\n'
        '    Using this outline, can you draft an introduction that includes the background, significance of the topic, the aim of the survey, and what the reader can expect \n'
        '    from the paper?'
    )
    return llm.invoke(prompt)

In [43]:
# NOTE(review): this rebinds `intro` from the function to its string result. Later
# cells read `intro` as the text, but this cell cannot be re-run without first
# re-running the cell that defines the function.
intro = intro(topic,outline2)
intro

" Title: LLM-Based NLG Evaluation: Current Status and Challenges\n\nIntroduction:\n\nNatural Language Generation (NLG) has emerged as a crucial component in Artificial Intelligence (AI) and Human-Computer Interaction (HCI), enabling machines to generate human-like text based on data or user inputs. One of the most significant advancements in NLG research is the use of Large Language Models (LLMs) to generate natural language responses, leading to impressive improvements in various applications such as customer service, content generation, and education.\n\nThe role of LLMs in NLG has gained increasing attention due to their ability to learn from vast amounts of data and generate text that closely resembles human-written language. However, with this growing interest comes the need for rigorous evaluation methods to ensure the quality and effectiveness of these systems. In this survey, we aim to discuss the current trends, methodologies, key findings, and future directions in LLM-based N

In [44]:
def lit(topic,summaries,out):
    """Ask the LLM to summarize the collected papers for the literature review.

    Args:
        topic: Topic text inserted into the prompt.
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.
        out: Outline text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    papers = '\n'.join(paper['title'] + ':  ' + paper['abstract'] for paper in summaries.values())
    # Continuation lines keep their four leading spaces so the prompt text is unchanged.
    prompt = (
        f'I am working on a survey on {topic}. The outline is as follows: {out}\n'
        '    I have collected the following key papers for the Thematic Overview section:\n'
        f'    {papers}\n'
        '    Can you summarize these papers and discuss how they contribute to the state of the art in this area?'
    )
    return llm.invoke(prompt)

In [45]:
# NOTE(review): this rebinds `lit` from the function to its string result. Later
# cells read `lit` as the text, but this cell cannot be re-run without first
# re-running the cell that defines the function.
lit = lit(topic,data,outline2)
lit

" These papers explore different aspects of evaluating large language models (LLMs) for text generation and dialogue systems. Here's a summary of each paper and their contributions:\n\n1. BatchEval: The authors propose a new paradigm called BatchEval to address the limitations of current sample-wise evaluation methods, such as sensitivity to prompt design, poor resistance to noise, and inferior ensemble performance with static references. They demonstrate that BatchEval outperforms state-of-the-art methods by 10.5% on Pearson correlations with only 64% API cost on average.\n\n2. A Comprehensive Analysis of the Effectiveness of Large Language Models as Automatic Dialogue Evaluators: This study analyzes the effectiveness of LLMs in automatic dialogue evaluation, examining their strengths and limitations. The authors show that strong LLM judges like GPT-4 can match human preferences well, making LLM-as-a-judge a scalable and explainable way to approximate human preferences.\n\n3. JudgeLM:

In [46]:
# print() renders the line breaks instead of showing the escaped repr.
print(lit)

 These papers explore different aspects of evaluating large language models (LLMs) for text generation and dialogue systems. Here's a summary of each paper and their contributions:

1. BatchEval: The authors propose a new paradigm called BatchEval to address the limitations of current sample-wise evaluation methods, such as sensitivity to prompt design, poor resistance to noise, and inferior ensemble performance with static references. They demonstrate that BatchEval outperforms state-of-the-art methods by 10.5% on Pearson correlations with only 64% API cost on average.

2. A Comprehensive Analysis of the Effectiveness of Large Language Models as Automatic Dialogue Evaluators: This study analyzes the effectiveness of LLMs in automatic dialogue evaluation, examining their strengths and limitations. The authors show that strong LLM judges like GPT-4 can match human preferences well, making LLM-as-a-judge a scalable and explainable way to approximate human preferences.

3. JudgeLM: The au

In [49]:
def challenges(topic,summaries):
    """Ask the LLM for trends, methodologies, challenges and gaps in the papers.

    Args:
        topic: Topic text inserted into the prompt.
        summaries: Mapping whose values are dicts with the string keys
            ``'title'`` and ``'abstract'``.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    papers = '\n'.join(paper['title'] + ':  ' + paper['abstract'] for paper in summaries.values())
    # Continuation lines keep their four leading spaces so the prompt text is unchanged.
    prompt = (
        f'Here’s a summary of key papers in the field of {topic}:{papers}\n'
        '    Based on these summaries, can you identify the major trends, common methodologies, and challenges in the current research? What are the gaps that still need to be \n'
        '    addressed?'
    )
    return llm.invoke(prompt)

In [50]:
# NOTE(review): this rebinds `challenges` from the function to its string result.
# Later cells read `challenges` as the text, but this cell cannot be re-run without
# first re-running the cell that defines the function.
challenges  = challenges(topic,data)
challenges

' The major trends in the current research on large language models (LLMs) include:\n\n1. Evaluation and benchmarking of LLMs: Researchers are exploring various methods to evaluate the quality of responses generated by LLMs, especially in open-ended scenarios where existing benchmarks may not be comprehensive. This includes using LLMs themselves as evaluators, fine-tuning LLMs as judges, and constructing large and diverse evaluation datasets.\n2. Robustness and adversarial perturbations: Researchers are investigating the robustness of LLMs in handling various types of adversarial perturbations at both turn and dialogue levels. This includes probing their ability to handle misinformation, biased data, and other forms of adversarial attacks.\n3. Fairness and bias: There is a growing interest in understanding the fairness and potential biases in LLMs, particularly in areas such as language generation and evaluation. Researchers are exploring methods to mitigate these biases and ensure tha

In [53]:
def conclusion(topic,intro,l,chall):
    """Ask the LLM to draft a conclusion from the already written sections.

    Args:
        topic: Topic text inserted into the prompt.
        intro: Introduction text inserted into the prompt.
        l: Literature-review text inserted into the prompt.
        chall: Challenges text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    # The second line keeps its four leading spaces so the prompt text is unchanged.
    prompt = (
        f'Here we have the topic: {topic}. The introduction {intro}. The review {l} and the challenges {chall}.\n'
        '    Can you draft a conclusion that summarizes the key findings and discusses the overall state of the field and future outlook?'
    )
    return llm.invoke(prompt)

In [54]:
# At this point `intro`, `lit` and `challenges` hold the generated texts, not the functions.
con = conclusion(topic,intro,lit,challenges)
con 

' Conclusion:\n\nIn this survey, we have explored the current trends, methodologies, key findings, and challenges in evaluating Large Language Models (LLMs) for Natural Language Generation (NLG). The growing interest in LLM-based NLG systems is driven by their potential impact on various industries and society as a whole. However, with this increasing attention comes the need for rigorous evaluation methods to ensure their quality and effectiveness.\n\nThe papers reviewed in this survey demonstrate significant progress in evaluating LLMs for text generation and dialogue systems. They propose new methods such as BatchEval, JudgeLM, and using LLMs as judges, analyze their strengths and limitations, and address challenges related to open-ended scenarios and measuring human preferences. These contributions provide valuable insights into the potential of using LLMs as evaluators and judges for various applications.\n\nThe current research on LLMs is focused on evaluation and benchmarking, r

In [55]:
def conmbine(intro,lit,challenges,con,out):
    """Ask the LLM to merge the drafted sections into one paper.

    Args:
        intro: Introduction text inserted into the prompt.
        lit: Literature-review text inserted into the prompt.
        challenges: Challenges text inserted into the prompt.
        con: Conclusion text inserted into the prompt.
        out: Outline text inserted into the prompt.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    # The prompt starts with a space and the second line keeps its four leading
    # spaces so the prompt text is unchanged.
    prompt = (
        f' Based on the introduction {intro}, the review {lit}, the challenges {challenges} and the conclusion {con}. \n'
        f'    Combine it to a fluent paper with the outline {out}.'
    )
    return llm.invoke(prompt)

In [56]:
# Let the LLM merge the four drafted sections, following outline2.
combined_survey  = conmbine(intro,lit,challenges,con,outline2)


In [57]:
# Save the LLM-merged survey to summaries/<paper_id>/combined_survey.txt.
store_summary(combined_survey,'combined_survey.txt')

In [58]:
# Plain concatenation of the drafted sections, without another LLM pass.
# Join with real newlines; the previous '/n' ended up as literal text in the file.
combine = '\n'.join([intro, lit, challenges, con])
store_summary(combine,'combine.txt')