In [23]:
from langchain_community.llms import Ollama
import json
import random
import re
import numpy as np
#import pandas as pd


# Classifiaction
Can the llm classify if a paper is relevant to a given topic?

In [24]:
def get_json(file_name):
    # Step 1: Read the JSON file
    with open(file_name + '.json', 'r') as file:
        json_data = json.load(file)
    return json_data
def write_json(dict_data,file_name):
    with open('results/'+file_name+'.json', 'w') as file:
        json.dump(dict_data, file, indent=4)

In [25]:
# The original state of the art paper
paper_id = "2402.01383v1"
original = get_json('../task3/dataset/'+paper_id+'data')
original['title']
topic = original['title']+': '+ original['abstract']
topic

'LLM-based NLG Evaluation: Current Status and Challenges: Evaluating natural language generation (NLG) is a vital but challenging\nproblem in artificial intelligence. Traditional evaluation metrics mainly\ncapturing content (e.g. n-gram) overlap between system outputs and references\nare far from satisfactory, and large language models (LLMs) such as ChatGPT\nhave demonstrated great potential in NLG evaluation in recent years. Various\nautomatic evaluation methods based on LLMs have been proposed, including\nmetrics derived from LLMs, prompting LLMs, and fine-tuning LLMs with labeled\nevaluation data. In this survey, we first give a taxonomy of LLM-based NLG\nevaluation methods, and discuss their pros and cons, respectively. We also\ndiscuss human-LLM collaboration for NLG evaluation. Lastly, we discuss several\nopen problems in this area and point out future research directions.'

### Small dataset: Give a list of papers
Here there are 10 relevant and 10 unrelated papers. The unrelated are papers related to 'data science'. The dataset was created in the file create_dataset.ipynb


In [26]:
data = get_json('dataset/'+paper_id)
data.keys()

dict_keys(['1802.03292', '2311.09184', '0907.3804', '2307.10928', '2303.07610', '2002.05658', '1610.07365', '2310.00752', '2310.00785', '2310.19740', '2207.07901', '1612.04037', '2310.00074', '2310.17631', '1501.05039', '2302.04166', '2311.09204', '1908.05986', '2205.01553', '1309.0717'])

In [27]:


def get_labels(d):
    labels = [details['label'] for id, details in d.items()]
    return labels
def get_relevant_ids(d):
    ids = [id for id, details in d.items() if details['label']==1]
    return ids
# Usage example
labels = get_labels(data)
print(labels)
ids = get_relevant_ids(data)
print(ids)


[0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]
['2311.09184', '2307.10928', '2303.07610', '2310.00752', '2310.00785', '2310.19740', '2310.00074', '2310.17631', '2302.04166', '2311.09204']


In [28]:
# Get the llm mistral
llm = Ollama(model = "mistral",temperature=0)

In [29]:
def select_with_summary_id(topic,summaries):
    choice = "\n".join(["**"+str(n)+"** "+i['title']+i['abstract'] for n,i in summaries.items()]) #long version with abstracts to all papers
    #choice = "\n".join(["**"+str(n)+"** "+i['title'] for n,i in summaries.items()]) #short version with only titles of all papers.
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.

    Below is a list of papers. Each paper is numbered and includes its title and summary.

    List of Papers:
    {choice}

    Instructions:

    Indicate which papers are relevant to the topic by writing only the numbers of the relevant papers.
    Do not provide any explanations or use any other numbers.
    Format your answer as a list of numbers separated by commas.
    Please provide your answer below:"""
    print(len(instruction.split(' ')))
    answer = llm.invoke(instruction)

    return answer

In [30]:
answer = select_with_summary_id(topic,data)
answer

2715


' 2310.17631, 2311.09204, 1501.05086, 2311.09204, 1908.05986, 2205.01553, 1309.0717'

In [16]:
n = re.findall(r'\d{4}\.\d{5}',answer)
evaluate_step(n)

(9, 0, 1, 10)

In [17]:
def find_hallucinations(a):
    h = []
    for number in a:
        if number not in data.keys():
            h.append(number)
    return h
def find_duplicates(lst):
    return list(set([item for item in lst if lst.count(item) > 1]))

#Example
print(find_duplicates([1,1,1,2,2])) 


[1, 2]


### Find out if the order of the papers in the list matters
The list is shuffled 9 times and the result is evaluated

In [18]:
def evaluate_step(n3):
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for key, value in data.items():
        if key in n3:
            if value['label']==1:
                TP+=1
            else:
                FP+=1
        else:
            if value['label']==1:
                FN+=1
            else:
                TN+=1
    return(TP,FP,FN,TN)

In [19]:


def run():
    scores = {}
    scores['run'] = {}
    combined = []
    
    for i in range(10):
        scores['run'][i] = {}
        experiment = scores['run'][i]
        
        #shuffle data
        items = list(data.items())    
        random.shuffle(items)
        shuffled_dict = dict(items)
        
        #Get answer
        answer = select_with_summary_id(topic,shuffled_dict)
        n = re.findall(r'\d{4}\.\d{5}',answer)
        
        #store the numbers
        experiment['answer'] = n
        combined = combined+n
        
        #caluculate metrics
        TP,FP,FN,TN = evaluate_step(n)
        experiment['n_predictions'] = len(n)
        experiment['n_correct'] = TP
        experiment['P'] = TP/len(n)
        experiment['R'] = TP/(TP+FN)
        experiment['F1'] = 2*experiment['P']*experiment['R']/(experiment['P']+experiment['R'])
    
    print(scores)
    print(n)
    scores['mean_P'] = np.mean([scores['run'][s]['P'] for s in scores['run'].keys()])
    scores['mean_R'] = np.mean([scores['run'][s]['R'] for s in scores['run'].keys()])
    scores['mean_F1'] = np.mean([scores['run'][s]['F1'] for s in scores['run'].keys()])
    
    
    #Get metric of combined
    TP,FP,FN,TN = evaluate_step(combined)
    scores['combined'] = {}
    scores['combined']['answer'] = combined
    
    scores['combined']['n_predictions'] = len(combined)
    scores['combined']['n_correct'] = TP
    scores['combined']['P'] = TP/len(combined)
    scores['combined']['R'] = TP/(TP+FN)
    scores['combined']['F1'] = 2*scores['combined']['P']*scores['combined']['R']/(scores['combined']['P']+scores['combined']['R'])
    scores['combined']['hallucinations']=list(np.unique(find_hallucinations(combined)))

    #Get metric of taking only results that are present at least once
    duplicates = find_duplicates(combined)
    TP,FP,FN,TN = evaluate_step(duplicates)
    scores['duplicates'] = {}
    scores['duplicates']['n_predictions'] = TP+FP
    scores['duplicates']['n_correct'] = TP
    scores['duplicates']['P'] = TP/(TP+FP)
    scores['duplicates']['R'] = TP/(TP+FN)
    scores['duplicates']['F1'] = 2*scores['duplicates']['P']*scores['duplicates']['R']/(scores['duplicates']['P']+scores['duplicates']['R'])
    scores['duplicates']['hallucinations']=list(np.unique(find_hallucinations(duplicates)))

    return scores
result = run()
print(result)

380
380
380
380
380
380
380
380
380
380
{'run': {0: {'answer': ['2310.00785', '2311.09184', '2310.19740', '2310.17631', '2310.00752', '2307.10928', '2310.00074', '2311.09204', '2310.17631', '2310.00752', '2303.07610', '2311.09812', '2310.00785', '2310.17631', '2002.05658', '2310.00074', '1610.07365', '2310.00752', '1908.05986', '1501.05039', '2310.00785', '2310.17631', '1802.03292', '2205.01553', '1612.04037', '2002.05658', '2302.04166', '2307.10928', '2310.19740', '2310.17631', '2310.00752', '2310.00074', '2311.09204', '2310.00785', '2311.09184', '1610.07365', '2310.00752', '2205.01553', '1802.03292', '2302.04166', '2307.10928', '2310.19740', '2310.17631', '2310.00752', '2310.00074', '2311.09204', '2310.00785', '2310.17631', '1610.07365', '2310.00752', '1908.05986', '1501.05039', '2310.00785', '2310.17631', '1802.03292', '2205.01553', '1612.04037', '2002.05658', '2302.04166', '2307.10928', '2310.19740', '2310.17631', '2310.00752', '2310.00074', '2311.09204', '2310.00785', '2310.17631'

In [21]:
#write_json(result,'summaries_list')
write_json(result,'summaries_list_short')

    

In [22]:
#Create tables
def P_R_F_table(s):
    table = []
    for k,v in s['run'].items():
        table.append([np.round(v['P'],2),np.round(v['R'],2),np.round(v['F1'],2)])
    sd = list(np.std(table,axis=0))
    table.append([np.round(s['mean_P'],2),np.round(s['mean_R'],2),np.round(s['mean_F1'],2)])
    table.append(sd)

    return(table)

table = P_R_F_table(result)
print(table)
np.savetxt('results/P_R_F1_short.txt',table,fmt='%.2f')

[[0.01, 1.0, 0.01], [1.0, 0.5, 0.67], [0.56, 0.9, 0.69], [1.0, 0.7, 0.82], [0.64, 0.9, 0.75], [1.0, 0.9, 0.95], [1.0, 0.7, 0.82], [0.9, 0.9, 0.9], [0.38, 1.0, 0.56], [1.0, 0.8, 0.89], [0.75, 0.83, 0.71], [0.32675526009538086, 0.14866068747318506, 0.2581162528784269]]


The order clearly matter as there are always different answers. But also the results are bad.

In [33]:
#Create other table
def predicted_table(s):
    table = []
    for k,v in s['run'].items():
        table.append([len(v["answer"]),v['n_correct'],len(find_hallucinations(v["answer"])),len(find_duplicates(v["answer"]))])
    m = list(np.mean(table,axis=0))
    sd = list(np.std(table,axis=0))
    table.append(m)
    table.append(sd)
    return(table)
table = predicted_table(result)
print(table)
np.savetxt('results/predictions.txt',table,fmt='%.2f')

[[4, 2, 0, 1], [3, 1, 1, 0], [2, 1, 0, 0], [4, 2, 0, 0], [5, 3, 1, 0], [5, 2, 2, 0], [2, 2, 0, 0], [4, 3, 0, 0], [8, 3, 2, 2], [9, 2, 1, 1], [4.6, 2.1, 0.7, 0.4], [2.2, 0.7000000000000001, 0.7810249675906654, 0.66332495807108]]


### Ask every paper seperatly if it is relevant

In [34]:
#Is it better to ask for each paper separatly if it is relevant?
one = list(data.keys())[1]
print(one)
def ask_relevant(topic,id):
    paper = data[id]
    choice = paper['title']+paper['abstract']
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.
                    Indicate if the following paper is relevant: {choice}.
                    Only say yes or no. Please provide your answer below:"""
    answer = llm.invoke(instruction)

    return answer
a = ask_relevant(topic,one)
print(a)

2311.09184
 Yes, the paper is relevant as it also discusses the use of large language models (LLMs) for NLG evaluation and summarization tasks. It provides insights into the challenges and performance gaps of LLMs in instruction controllable text summarization and evaluates various LLM-based automatic evaluation methods.


In [30]:
def ask_relevant(topic,paper):
    choice = paper['title']+paper['summary']
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.
                    Indicate if the following paper is relevant: {choice}.
                    Only say yes or no. Please provide your answer below:"""
    answer = llm.invoke(instruction)

    return answer
def ask_llm_one_by_one():
    answer = []
    for id,paper  in data.items():
        a = ask_relevant(topic,paper)
        print(a)
        if 'Yes' in a:
            answer.append(id)
    return answer
answer1 = ask_llm_one_by_one()
print(answer1)

 Yes, the paper "Shepherd: A Critic for Language Model Generation" is relevant to the topic of LLM-based NLG evaluation as it introduces a language model specifically tuned to critique and suggest refinements, which can be considered as an automatic evaluation method based on LLMs. The paper also discusses the performance of Shepherd in comparison to other models such as ChatGPT, providing insights into the effectiveness of LLM-based NLG evaluation methods.
 No. The paper "LLM-based NLG Evaluation: Current Status and Challenges" is not directly related to the topic of MEMICS (11th Doctoral Workshop on Mathematical and Engineering Methods in Computer Science). The former focuses specifically on natural language generation evaluation using large language models, while the latter is a general call for papers covering various areas of computer science.
 Yes, the paper is relevant as it discusses the use of large language models (LLMs) for NLG evaluation and specifically mentions the fine-t

KeyboardInterrupt: 

In [131]:
evaluate_step(answer1)

(10, 0, 0, 10)

In [27]:
topic
title = arxiv_tool.fetch_paper_titles([paper_id])
title

['LLM-based NLG Evaluation: Current Status and Challenges']

In [31]:
def ask_llm_one_by_one_title():
    answer = []
    for id,paper  in data.items():
        a = ask_relevant(title,paper)
        print(a)
        if 'Yes' in a:
            answer.append(id)
    return answer
answer1 = ask_llm_one_by_one_title()
print(answer1)

 Yes, the paper "Shepherd: A Critic for Language Model Generation" is relevant to the topic of ['LLM-based NLG Evaluation: Current Status and Challenges']. The paper introduces a language model specifically tuned for critiquing responses and suggesting refinements, which relates to the evaluation of large language models in natural language generation (NLG). Additionally, the paper discusses the importance of high quality feedback datasets and compares the performance of Shepherd with other competitive alternatives, which are also relevant topics in the field.
 No. The paper does not seem relevant to the topic of 'LLM-based NLG Evaluation: Current Status and Challenges'. It appears to be an announcement for a doctoral workshop in mathematical and engineering methods in computer science, rather than a research paper on natural language generation (NLG) evaluation using large language models (LLMs).
 Yes, this paper is relevant as it discusses large language models (LLMs) and their evalu

In [32]:
evaluate_step(answer1)

(10, 1, 0, 9)

In [33]:
def ask_relevant_titles(topic,paper):
    choice = paper['title']
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.
                    Indicate if the following paper is relevant: {choice}.
                    Only say yes or no. Please provide your answer below:"""
    answer = llm.invoke(instruction)

    return answer
def ask_llm_one_by_one():
    answer = []
    for id,paper  in data.items():
        a = ask_relevant_titles(title,paper)
        print(a)
        if 'Yes' in a:
            answer.append(id)
    return answer
answertitle = ask_llm_one_by_one()
print(answertitle)

 Yes. The paper "Shepherd: A Critic for Language Model Generation" discusses language model generation, which is related to Natural Language Generation (NLG) and the use of Large Language Models (LLMs). Therefore, it can be considered relevant to the topic.
 Based on the title provided, it is unlikely that the paper "Proceedings 11th Doctoral Workshop on Mathematical and Engineering Methods in Computer Science" is directly relevant to the topic of 'LLM-based NLG Evaluation: Current Status and Challenges'. Therefore, my answer is no.
 No, the paper "Llama 2: Open Foundation and Fine-Tuned Chat Models" does not seem to be directly relevant to the topic of 'LLM-based NLG Evaluation: Current Status and Challenges'. The paper focuses on Llama 2, an open foundation and fine-tuned chat model, while the topic is about the evaluation methods and challenges for Natural Language Generation (NLG) based on Large Language Models (LLMs).
 Yes, the paper "Exploring ChatGPT's Ability to Rank Content: A

In [35]:
evaluate_step(answertitle)
#It makes good guesses: Following 2 are wrong. But they are reasonable based on the information the llm got.
#Yes. The paper may discuss aspects of Natural Language Generation (NLG) evaluation using Large Language Models (LLMs) within the context of Computer Science.
#No, the paper "Llama 2: Open Foundation and Fine-Tuned Chat Models" does not seem to be directly relevant to the topic of 'LLM-based NLG Evaluation: Current Status and Challenges'. The paper focuses on Llama 2, an open foundation and fine-tuned chat model, while the topic is about the evaluation methods and challenges for Natural Language Generation (NLG) based on Large Language Models (LLMs).


(9, 1, 1, 9)

### Over all original papers

In [18]:
# The original state of the art paper
#paper_id = "2402.01383v1"
#paper_id = "2402.06196"
#paper_id = "2408.02304"
#paper_id = "2408.02464"
#paper_id = "2408.02085"
#paper_id = "2311.13731"
#paper_id = "2311.12785"
#paper_id = "2409.15816"
#paper_id = "2409.15180"
paper_id = "2409.09957"
original = get_json('../task3/dataset/'+paper_id+'data')
original['title']
topic = original['title']+': '+ original['abstract']
topic
data = get_json('dataset/'+paper_id)
len(data.keys())

20

In [4]:
def ask_relevant(topic,paper):
    choice = paper['title']+paper['abstract']
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.
                    Indicate if the following paper is relevant: {choice}.
                    Only say yes or no. Please provide your answer below:"""
    answer = llm.invoke(instruction)

    return answer

def evaluate_step(n3):
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for key, value in data.items():
        if key in n3:
            if value['label']==1:
                TP+=1
            else:
                FP+=1
        else:
            if value['label']==1:
                FN+=1
            else:
                TN+=1
    return(TP,FP,FN,TN)
def ask_llm_one_by_one():
    answer = []
    for id,paper  in data.items():
        a = ask_relevant(topic,paper)
        if 'Yes' in a:
            answer.append(id)
    return answer
answer1 = ask_llm_one_by_one()
print(answer1)
TP,FP,FN,TN = evaluate_step(answer1)

In [20]:
# Append line to the file
print(TP,FP,FN,TN)
P = TP/(TP+FP)
R = TP/(TP+FN)
F1 = 2*P*R/(P+R)
with open('results/ask_one_by_one.txt', 'a') as file:
    file.write(f"{paper_id} {P} {R} {F1}\n")

10 0 0 10


In [5]:
#Check one further
#-Analyse one paper with its references. Is the ground truth correct? How good is the llm, are its
#mistakes real mistakes?
paper_id = "2311.13731"
original = get_json('../task3/dataset/'+paper_id+'data')
original['title']
topic = original['title']+': '+ original['abstract']
topic
data = get_json('dataset/'+paper_id)
len(data.keys())

20

In [26]:
(8+0.875+0.9)/10
(5+0.9+0.8+0.7+0.9+0.7)/10
(5+0.9+0.888888888888889+0.7777777777777777+0.9473684210526316+0.8235294117647058)/10

0.9337564499484005

In [22]:
print(topic)
#data['2303.04226']['label']

A Survey of Blockchain, Artificial Intelligence, and Edge Computing for Web 3.0: Web 3.0, as the third generation of the World Wide Web, aims to solve
contemporary problems of trust, centralization, and data ownership. Driven by
the latest advances in cutting-edge technologies, Web 3.0 is moving towards a
more open, decentralized, intelligent, and interconnected network. However,
increasingly widespread data breaches have raised awareness of online privacy
and security of personal data. Additionally, since Web 3.0 is a sophisticated
and complex convergence, the technical details behind it are not as clear as
the characteristics it presents. In this survey, we conduct an in-depth
exploration of Web 3.0 from the perspectives of blockchain, artificial
intelligence, and edge computing. Specifically, we begin with summarizing the
evolution of the Internet and providing an overview of these three key
technological factors. Afterward, we provide a thorough analysis of each
technology separate

In [23]:
# We know from before precision:0.875 recall:0.7   F1:0.778, hence 7 out of 10 papers were found (recall=7/10), given answer were 8 (precision=7/8) 
answer = []
explanation = []
for id,paper  in data.items():
    a = ask_relevant(topic,paper)
    explanation.append(a)
    if 'Yes' in a:
        answer.append(id)
        #Check for false positive:
        # if value['label']==1:

        if data[id]['label']==0:
            print("False positive:")
            print(paper)
            print('-------')
            print(a)
            print('-------')
    else:
        #Check for false negatives:
        if data[id]['label']==1:
            print("False negative")
            print(paper)
            print('---------')
            print(a)
            print('--------')

print(answer)

False positive:
{'title': 'Crypto Makes AI Evolve', 'abstract': 'Adopting cryptography has given rise to a significant evolution in Artificial\nIntelligence (AI). This paper studies the path and stages of this evolution. We\nstart with reviewing existing relevant surveys, noting their shortcomings,\nespecially the lack of a close look at the evolution process and solid future\nroadmap. These shortcomings justify the work of this paper. Next, we identify,\ndefine and discuss five consequent stages in the evolution path, including\nCrypto-Sensitive AI, Crypto-Adapted AI, Crypto-Friendly AI, Crypto-Enabled AI,\nCrypto-Protected AI. Then, we establish a future roadmap for further research\nin this area, focusing on the role of quantum-inspired and bio-inspired AI.', 'date': '2022-06-25T15:04:47+00:00', 'label': 0}
-------
 Yes, the paper "Crypto Makes AI Evolve" is relevant to the topic as it discusses the intersection of artificial intelligence and cryptography, which is one of the key te

### Real case. Don't give a list

In [3]:
# Read the CSV file into a DataFrame
df = pd.read_csv('task2/dataset/'+paper_id+'_all.csv')
all_numbers = list(df['arxiv_id'])
all_numbers

[1606.06565,
 2305.10403,
 2306.04181,
 2308.07201,
 2310.00785,
 2210.11416,
 2309.15217,
 2302.04166,
 2311.00681,
 2312.10355,
 2309.13701,
 2310.00074,
 2303.0761,
 2310.00752,
 2311.18702,
 2310.08491,
 2309.13633,
 2311.00686,
 2310.19792,
 2310.0547,
 2310.1974,
 2307.02762,
 2310.01432,
 2311.09184,
 2305.14239,
 2305.14658,
 2309.13308,
 2307.07889,
 2303.15621,
 2310.15123,
 2206.05802,
 2311.09204,
 2307.09288,
 2305.17926,
 2308.04592,
 2310.11593,
 2306.05087,
 2309.12546,
 2307.03025,
 2307.10928,
 2401.00437,
 2312.15407,
 2308.01862,
 2306.05685,
 2310.17631]

In [50]:
def find_relvant_papers(topic, date,n_papers):
    buffer = 10
    client = arxiv.Client()
    search = arxiv.Search(
    query = topic,
    max_results = n_papers+buffer,
    # sort_by=arxiv.SortCriterion.SubmittedDate
    # sort_by = arxiv.SortCriterion.Relevance #Is default

    )

    results = list(client.results(search))
    # Lists to hold all IDs and their labels
    all_ids = []
    papers = []
    i = 0
    while (len(all_ids)<n_papers) and (i<len(results)):
        result = results[i]
        arxiv_number = result.entry_id.split('/')[-1]
        arxiv_number = arxiv_number.split('v')[0]
        submission_date = result.published
        #check if before the overview
        if (submission_date<date):
            all_ids.append(arxiv_number)
            papers.append(result)

        #print(submission_date)
        #print(result.title)
        i+=1
    return all_ids,papers

In [39]:
#find_relvant_papers(title, date,n_papers)
date = arxiv_tool.fetch_paper_dates([paper_id])[0]
print(date)
title = arxiv_tool.fetch_paper_titles([paper_id])[0]
print(title)


2024-02-02 13:06:35+00:00
LLM-based NLG Evaluation: Current Status and Challenges


In [40]:
predicted,pre_paper = find_relvant_papers(title, date,100)

In [41]:
print(predicted[0])
print(pre_paper[0].title)
print(paper_id)
print(all_numbers[0])

2006.14799
Evaluation of Text Generation: A Survey
2402.01383v1
1606.06565


In [42]:
for p in predicted:
    if p in all_numbers:
        print(1)
if 2006.14799 in all_numbers:
    print(2)

In [43]:
def ask_relevant(topic,choice):
    instruction = f"""Your task is to determine which papers are relevant to the topic: {topic}.
                    Indicate if the following paper is relevant: {choice}.
                    Only say yes or no. Please provide your answer below:"""
    answer = llm.invoke(instruction)

    return answer

Conclusion: arxiv gave 10 papers related to the topic, the first one was the original which was removed by the date (as it has to be strictly older). None of the proposed paper are in the references of the original papers. Arxiv search with matching keywords. 
Asking the llm all of these proposed papers are relevant.

In [44]:
for p in pre_paper:
    answer = []
    a = ask_relevant(title,p.summary)
    #print(a)
    if 'Yes' in a:
        answer.append(title)
    else:
         print(a) 
# No, this paper is not relevant to LLM-based NLG Evaluation: Current Status and Challenges as it focuses on Nonlocal Gravity (NLG) in the context of classical physics, specifically gravitation, rather than Natural Language Generation (NLG) evaluation.
# No, this paper is not directly relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on the automatic extraction of subgrammars for controlling and speeding up NLG using explanation-based learning (EBL), rather than evaluating or discussing challenges related to LLM-based NLG evaluation specifically.
# No, the paper is not directly relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on Abductive Reasoning and benchmarks like aNLI and aNLG, but it does not specifically discuss LLMs or Natural Language Generation evaluation methods.
# No, the paper does not seem relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on Nonlocal Gravity (NLG) in astrophysics and cosmology, specifically discussing its implications for effective dark matter in three ultra-diffuse galaxies. There is no mention or relevance to LLM-based Natural Language Generation (NLG) evaluation or current challenges in this area.
# No. The paper does not seem relevant to the topic of LLM-based NLG Evaluation: Current Status and Challenges as it does not discuss any research related to evaluation, current status, or challenges in NLG using LLMs.
# Based on the title and abstract provided, it seems that this paper focuses on using NLG (Natural Language Generation) to document eBusiness models, rather than evaluating LLM-based NLG specifically. Therefore, I would classify this paper as not directly relevant to the topic of "LLM-based NLG Evaluation: Current Status and Challenges."

 No, this paper is not relevant to LLM-based NLG Evaluation: Current Status and Challenges as it focuses on Nonlocal Gravity (NLG) in the context of classical physics, specifically gravitation, rather than Natural Language Generation (NLG) evaluation.
 No, this paper is not directly relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on the automatic extraction of subgrammars for controlling and speeding up NLG using explanation-based learning (EBL), rather than evaluating or discussing challenges related to LLM-based NLG evaluation specifically.
 No, the paper is not directly relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on Abductive Reasoning and benchmarks like aNLI and aNLG, but it does not specifically discuss LLMs or Natural Language Generation evaluation methods.
 No, the paper does not seem relevant to LLM-based NLG Evaluation: Current Status and Challenges. The paper focuses on Nonlocal Gravity (NLG) i

Out of 100 papers from arxiv related to the topic :LLM-based NLG Evaluation: Current Status and Challenges
The llm filters out 6 papers. It was able to filter out papers that are there by mistake, for example it was there by mismatching the abreviation NLG (natural language genereation) with NLG (nonlocal gravity). However these were far in the list of arxiv, it is also very capable of selecting relevant papers.

In [54]:
def find_relvant_papers(topic, date,n_papers):
    buffer = 10
    client = arxiv.Client()
    search = arxiv.Search(
    query = topic,
    max_results = n_papers+buffer,
    # sort_by=arxiv.SortCriterion.SubmittedDate
    # sort_by = arxiv.SortCriterion.Relevance #Is default

    )

    results = list(client.results(search))
    print(len(results))
    # Lists to hold all IDs and their labels
    all_ids = []
    papers = []
    i = 0
    while (len(all_ids)<n_papers) and (i<len(results)):
        result = results[i]
        arxiv_number = result.entry_id.split('/')[-1]
        arxiv_number = arxiv_number.split('v')[0]
        submission_date = result.published
        #check if before the overview
        if (submission_date<date):
            all_ids.append(arxiv_number)
            papers.append(result)

        #print(submission_date)
        #print(result.title)
        i+=1
    return all_ids,papers

In [55]:
print(topic)
predicted_topic,pre_paper_topic = find_relvant_papers(topic, date,50)


LLM-based NLG Evaluation: Current Status and Challenges: Evaluating natural language generation (NLG) is a vital but challenging
problem in artificial intelligence. Traditional evaluation metrics mainly
capturing content (e.g. n-gram) overlap between system outputs and references
are far from satisfactory, and large language models (LLMs) such as ChatGPT
have demonstrated great potential in NLG evaluation in recent years. Various
automatic evaluation methods based on LLMs have been proposed, including
metrics derived from LLMs, prompting LLMs, and fine-tuning LLMs with labeled
evaluation data. In this survey, we first give a taxonomy of LLM-based NLG
evaluation methods, and discuss their pros and cons, respectively. We also
discuss human-LLM collaboration for NLG evaluation. Lastly, we discuss several
open problems in this area and point out future research directions.
0


In [53]:
len(predicted_topic)

0

Here we see that to give the abstract of the original paper is too long and complex that the arxiv finds any papers. Maybe this case has to be taken care of when it happens
and the topic has to be compressed

Unused

In [None]:
def select_with_summary(topic,summaries):
    instruction = "Indicate all relevant papers to the topic: "+topic+". Which papers of the following are relevant?:"
    choice = "/n".join(["**"+str(n)+"** "+i['title']+i['summary'] for n,i in enumerate(summaries)])
    clearification = "write the number of the paper of the numbers i gave you. Give only the number of releveant papers. Don't use any other numbers. Don't give any explanation. Give all the relevant papers."
    #answer = llm.invoke(instruction + choice + clearification)
    answer = llm.invoke(f"""Your task is to determine which papers are relevant to the topic: {topic}.

Below is a list of papers. Each paper is numbered and includes its title and summary.

List of Papers:
“”"
{choice}
“”"

Instructions:

    Indicate which papers are relevant to the topic by writing only the numbers of the relevant papers.
    Do not provide any explanations or use any other numbers.
    Format your answer as a list of numbers separated by commas (e.g., 1, 3, 5).

Please provide your answer below:""")

    return answer

In [None]:
def evaluate(answer,data):
    n3 = re.findall(r'\d{4}\.\d{5}',answer)
    #print(n3)
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for key, value in data.items():
        if key in n3:
            if value['label']==1:
                TP+=1
            elif value['label']==0:
                FP+=1
            else:
                print("problem")
        else:
            if value['label']==1:
                FN+=1
            else:
                TN+=1
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    print("out of ",TP+FP," predictions are ",TP," correct.")
    print("out of ",TP+FN," related papers were ",TP," found.")
    F1 = 2*precision*recall/(precision+recall)
    return(precision,recall,F1)

In [None]:
#shuffle the list of summaries given to the llm 10 times. How big is difference?
scores = []
#answer3 = select_with_summary_id(topic,data) previously executed
P,R,F1 = evaluate(answer3,data)
scores.append([round(float(P),5),round(float(R),5),round(float(F1),5)])
for i in range(9):
    #shuffle data
    items = list(data.items())    
    random.shuffle(items)
    shuffled_dict = dict(items)
    answer = select_with_summary_id(topic,shuffled_dict)
    P, R, F1 = evaluate(answer, shuffled_dict)
    scores.append([round(float(P),5),round(float(R),5),round(float(F1),5)])

m = np.mean(scores,axis=0)
s = np.std(scores,axis=0)
scores.append(m)
scores.append(s)
print(np.matrix(scores))
filename = 'title_and_summaries'
np.savetxt('task2/results/'+filename+'.txt',scores,fmt='%.4f')
"""out of  4  predictions are  4  correct.
out of  10  related papers were  4  found.
out of  4  predictions are  4  correct.
out of  10  related papers were  4  found.
out of  5  predictions are  2  correct.
out of  10  related papers were  2  found.
out of  6  predictions are  2  correct.
out of  10  related papers were  2  found.
out of  5  predictions are  3  correct.
out of  10  related papers were  3  found.
out of  6  predictions are  1  correct.
out of  10  related papers were  1  found.
out of  5  predictions are  3  correct.
out of  10  related papers were  3  found.
out of  6  predictions are  3  correct.
out of  10  related papers were  3  found.
out of  6  predictions are  4  correct.
out of  10  related papers were  4  found.
out of  6  predictions are  2  correct.
out of  10  related papers were  2  found."""


In [None]:
filename = 'title_and_summaries'
np.savetxt('task2/results/'+filename+'_rouge.txt',scores,fmt='%.4f')
#The last two rows are metrics for the values before. We see that the recall is pretty low in all the combinations. Meaning the llm found only few sources 
# and this pretty much regardless of how many it suggested, see the standard deviationof the recall in last line and see that the nummber of correct sources 
# are between 2 and 4 with 1 beeing an exception. 
# The precision on the other hand does vary quite much, it has a higher standard deviation (of 0.26238). The precision is lower when it suggests more papers.

In [None]:
#Are the suggestions overlapping or are they the same?
scores = []
#answer3 = select_with_summary_id(topic,data) previously executed
P,R,F1 = evaluate(answer3,data)
scores.append([round(float(P),5),round(float(R),5),round(float(F1),5)])
all_answers = answer3
for i in range(9):
    #shuffle data
    items = list(data.items())    
    random.shuffle(items)
    shuffled_dict = dict(items)
    answer = select_with_summary_id(topic,shuffled_dict)
    P, R, F1 = evaluate(answer, shuffled_dict)
    scores.append([round(float(P),5),round(float(R),5),round(float(F1),5)])
    all_answer = all_answers+answer

m = np.mean(scores,axis=0)
s = np.std(scores,axis=0)
scores.append(m)
scores.append(s)
print(np.matrix(scores))
print(all_answers)
