## Assessing Discussions of LLM Limitations: Multi-Prompt Evaluation with GPT 
This script employs five unique prompting techniques, including zero-shot and few-shot approaches, to guide GPT-4.o in evaluating the extent of LLM limitations discussed in academic papers. It aims to precisely rate each paper based on the thoroughness of its discussion on LLM limitations, using targeted prompts to achieve an in-depth analysis.

### Prompt1: Standard Baseline Evaluation Prompt

In [None]:
import openai
import json
import csv
from tqdm import tqdm

openai.api_key = 'Place your API Key here'

def evaluate_paper_baseline(title, summary):
    prompt = f"""
    Can you please let me know whether the following paper is about language models (e.g., LMs or LLMs) and whether it talks about their limitations? If so, please indicate the parts in the abstract or title it does so. Please be brief in your explanations. Note that LMs and LLMs include pre-trained transformer based language models and multimodal, visual language models. Please include *all* kinds of language models but *no* other, more general models in your classification.

    Answer in the following way:
    LMs: [yes/no].
    Limitations of LMs: [rating from 1-5].
    Evidence: [the evidence text in the abstract or title].

    Title: {title}
    Paper: {summary}
    """

    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-05-13",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5
    )

    return response.choices[0].message['content'].strip()

def read_papers_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers = json.load(file)
    return papers

file_path = '10_gold_standard_papers.json'

csv_file_path = 'baseline_10_results.csv'
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'LMs', 'Limitations of LMs', 'Evidence'])

    papers = read_papers_from_json(file_path)

    for paper in tqdm(papers, desc="Processing papers"):
        title = paper.get('title', 'No Title')
        summary = paper.get('summary', 'No Summary')
        evaluation_result = evaluate_paper_baseline(title, summary)
        print(f"Raw evaluation result:\n{evaluation_result}\n")

        lines = evaluation_result.split('\n')
        talks_about_llms = ''
        rate = ''
        evidence = ''

        for line in lines:
            line = line.strip()
            if line.startswith('LMs:'):
                talks_about_llms = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Limitations of LMs:'):
                rate = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Evidence:'):
                evidence = line.split(':', 1)[1].strip().strip('[]')

        writer.writerow([title, talks_about_llms, rate, evidence])
        print(f"Title: {title}\nEvaluation Result: {evaluation_result}\n")

        print(f"Parsed values:\nTitle: {title}\nLMs: {talks_about_llms}\nLimitations of LMs: {rate}\nEvidence: {evidence}\n")

### Prompt2: Enhanced Instructional Detail Prompt

In [1]:
import openai
import json
import csv
from tqdm import tqdm

openai.api_key = ' Enter your API key here'

def evaluate_paper_baseline(title, summary):
    prompt = f"""
    Can you please let me know whether the following paper is about language models (e.g., LMs or LLMs) and whether it talks about their limitations? If so, please indicate the parts in the abstract or title it does so. Please be brief in your evidence. Note that LMs and LLMs include pre-trained transformer based language models and multimodal, visual language models. Please include *all* kinds of language models but *no* other, more general models in your classification.

    Based on the following rules, rate the abstract from 1-5 based on how thoroughly it discusses the limitations or challenges related to LLMs:
    1: Does not talk about LLMs at all or mention any limitation of LLMs.
    2: Mentions one limitation of LLMs very briefly.
    3: Mentions limitations of LLMs, but they are not the focus of the abstract.The limitations are discussed superficially or as secondary points.
    4: Provides multiple limitations of LLMs. The limitations are significant and discussed in details but alongside other topics.
    5: The entire abstract or most of the sentences focus on the limitations and challenges of LLMs. Sentences discuss limitations in detail, with strong wording indicating serious issues.


    Please answer in the following format by providing the rating and a brief evidence for each abstract.

    Answer in the following way:
    Does it talk about LLMs: [yes/no].
    Rate Limitations of LLMs: [rating from 1-5].
    Evidence: [the evidence text in the abstract or title].

    Title: {title}
    Paper: {summary}
    """

    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-05-13",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5
    )

    return response.choices[0].message['content'].strip()

def read_papers_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers = json.load(file)
    return papers

file_path = '10_gold_standard_papers.json'

csv_file_path = 'my_prompt_10_results.csv'
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Talks about LLMs', 'Rate', 'Evidence'])

    papers = read_papers_from_json(file_path)

    for paper in tqdm(papers, desc="Processing papers"):
        title = paper.get('title', 'No Title')
        summary = paper.get('summary', 'No Summary')
        evaluation_result = evaluate_paper_baseline(title, summary)

        # parsing the response
        lines = evaluation_result.split('\n')
        talks_about_llms = ''
        rate = ''
        evidence = ''
        
        for line in lines:
            if line.startswith('Does it talk about LLMs:'):
                talks_about_llms = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Rate Limitations of LLMs:'):
                rate = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Evidence:'):
                evidence = line.split(':')[1].strip().strip('[]')

        # results written on csv file 
        writer.writerow([title, talks_about_llms, rate, evidence])
        print(f"Title: {title}\nEvaluation Result: {evaluation_result}\n")


Processing papers:   3%|██▏                                                             | 1/29 [00:01<00:55,  1.97s/it]

Title: Directed Acyclic Transformer Pre-training for High-quality Non-autoregressive Text Generation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "existing NAR models lack proper pre-training, making them still far behind the pre-trained autoregressive models" and "alleviates the error accumulation problem in autoregressive generation."



Processing papers:   7%|████▍                                                           | 2/29 [00:02<00:36,  1.36s/it]

Title: Time-and-Space-Efficient Weighted Deduction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title discuss weighted deduction in NLP algorithms, topological sorting, and dynamic programming but do not mention language models or their limitations.



Processing papers:  10%|██████▌                                                         | 3/29 [00:04<00:37,  1.43s/it]

Title: Conditional Generation with a Question-Answering Blueprint
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "neural seq-to-seq models whose outputs often reveal hallucinations and fail to correctly cover important details" and "rendering conditional generation less opaque and more grounded".



Processing papers:  14%|████████▊                                                       | 4/29 [00:08<00:56,  2.28s/it]

Title: Collective Human Opinions in Semantic Textual Similarity
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on semantic textual similarity (STS) and human annotation disagreements. It does not mention language models (LMs or LLMs) or their limitations. The discussion is centered on the variance in human opinions and the inadequacy of current STS models to capture this variance, without specifying any particular type of language model.



Processing papers:  17%|███████████                                                     | 5/29 [00:09<00:44,  1.87s/it]

Title: Design Choices for Crowdsourcing Implicit Discourse Relations: Revealing the Biases Introduced by Task Design
Evaluation Result: Does it talk about LLMs: no.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on biases introduced by task design in crowdsourced linguistic annotations, particularly for implicit discourse relation annotation. It does not mention language models (LMs or LLMs) or their limitations.



Processing papers:  21%|█████████████▏                                                  | 6/29 [00:10<00:39,  1.74s/it]

Title: Communication Drives the Emergence of Language Universals in Neural Agents: Evidence from the Word-order/Case-marking Trade-off
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses neural agent-based simulations of language emergence and change, focusing on the word-order/case-marking trade-off and the development of a new framework (NeLLCom) for language learning and communication. It does not mention large language models (LLMs) or their limitations.



Processing papers:  24%|███████████████▍                                                | 7/29 [00:11<00:34,  1.55s/it]

Title: A Cross-Linguistic Pressure for Uniform Information Density in Word Order
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title focus on the uniform information density hypothesis and word order patterns in natural languages. There is no mention of language models (LMs or LLMs) or their limitations.



Processing papers:  28%|█████████████████▋                                              | 8/29 [00:12<00:29,  1.38s/it]

Title: Cross-functional Analysis of Generalization in Behavioral Learning
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title do not mention language models (LMs or LLMs) or their limitations. The focus is on behavioral learning and generalization in NLP tasks.



Processing papers:  31%|███████████████████▊                                            | 9/29 [00:14<00:32,  1.60s/it]

Title: Exploring Contrast Consistency of Open-Domain Question Answering Systems on Minimally Edited Questions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We find that the widely used dense passage retriever (DPR) performs poorly on our contrast sets, despite fitting the training set well and performing competitively on standard test sets."



Processing papers:  34%|█████████████████████▋                                         | 10/29 [00:16<00:29,  1.57s/it]

Title: Compositional Zero-Shot Domain Transfer with Text-to-Text Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses a novel compositional transfer learning framework (DoT5) for zero-shot domain transfer and its effectiveness in the biomedical domain. However, it does not mention any limitations or challenges related to LLMs.



Processing papers:  38%|███████████████████████▉                                       | 11/29 [00:17<00:26,  1.46s/it]

Title: MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title focus on a multilingual retrieval dataset and do not mention language models (LMs or LLMs) or their limitations. The discussion is centered on the dataset's creation, scope, and purpose.



Processing papers:  41%|██████████████████████████                                     | 12/29 [00:18<00:23,  1.39s/it]

Title: DMDD: A Large-Scale Dataset for Dataset Mentions Detection
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title focus on dataset mention detection in scientific literature and do not discuss language models (LMs, LLMs) or their limitations.



Processing papers:  45%|████████████████████████████▏                                  | 13/29 [00:21<00:26,  1.68s/it]

Title: T3L: Translate-and-Test Transfer Learning for Cross-Lingual Text Classification
Evaluation Result: Does it talk about LLMs: yes.
Rate Limitations of LLMs: 3.
Evidence: "Nowadays, cross-lingual text classifiers are typically built on large-scale, multilingual language models (LMs) pretrained on a variety of languages of interest. However, the performance of these models varies significantly across languages and classification tasks, suggesting that the superposition of the language modelling and classification tasks is not always effective."



Processing papers:  48%|██████████████████████████████▍                                | 14/29 [00:23<00:27,  1.84s/it]

Title: Introduction to Mathematical Language Processing: Informal Proofs, Word Problems, and Supporting Tasks
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses "mathematical language processing methods" and specific tasks such as identifier-definition extraction, formula retrieval, and informal theorem proving. It does not mention language models (LMs or LLMs) or their limitations.



Processing papers:  52%|████████████████████████████████▌                              | 15/29 [00:24<00:23,  1.67s/it]

Title: Evaluating a Century of Progress on the Cognitive Science of Adjective Ordering
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on the cognitive science of adjective ordering and the use of natural language processing technologies to evaluate proposals across multiple languages. It does not mention language models (LMs or LLMs) or their limitations.



Processing papers:  55%|██████████████████████████████████▊                            | 16/29 [00:26<00:20,  1.60s/it]

Title: Improving Multitask Retrieval by Promoting Task Specialization
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention language models (LMs or LLMs) or discuss their limitations. It focuses on multitask retrieval and task specialization.



Processing papers:  59%|████████████████████████████████████▉                          | 17/29 [00:27<00:18,  1.57s/it]

Title: Calibrated Interpretation: Confidence Estimation in Semantic Parsing
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on sequence generation models used for semantic parsing, calibration, and confidence estimation. There is no mention of language models (LMs or LLMs) or their limitations.



Processing papers:  62%|███████████████████████████████████████                        | 18/29 [00:29<00:16,  1.53s/it]

Title: Intent-calibrated Self-training for Answer Selection in Open-domain Dialogues
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention language models (LMs or LLMs) or discuss their limitations. The focus is on answer selection in open-domain dialogues and the introduction of an intent-calibrated self-training paradigm, without any reference to language models.



Processing papers:  66%|█████████████████████████████████████████▎                     | 19/29 [00:30<00:14,  1.42s/it]

Title: Benchmarking the Generation of Fact Checking Explanations
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention language models (LMs or LLMs) or their limitations. It focuses on the generation of fact-checking explanations and benchmarking with datasets and summarization approaches.



Processing papers:  69%|███████████████████████████████████████████▍                   | 20/29 [00:31<00:12,  1.42s/it]

Title: T 2 -NER: A Two-Stage Span-Based Framework for Unified Named Entity Recognition with Templates
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title focus on a two-stage span-based framework for Named Entity Recognition (NER) and do not mention language models (LMs or LLMs) or their limitations. The content is centered on NER tasks and methodologies, with no discussion of LLM-related challenges.



Processing papers:  72%|█████████████████████████████████████████████▌                 | 21/29 [00:32<00:10,  1.37s/it]

Title: PASTA: A Dataset for Modeling PArticipant STAtes in Narratives
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Experiments show that today’s LLMs can reason about states to some degree, but there is large room for improvement, especially in problems requiring access and ability to reason with diverse types of knowledge (e.g., physical, numerical, factual)."



Processing papers:  76%|███████████████████████████████████████████████▊               | 22/29 [00:34<00:09,  1.33s/it]

Title: U-CORE: A Unified Deep Cluster-wise Contrastive Framework for Open Relation Extraction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract and title focus on Open Relation Extraction (ORE) tasks and propose a framework called U-CORE. There is no mention of language models (LMs or LLMs) or their limitations.



Processing papers:  79%|█████████████████████████████████████████████████▉             | 23/29 [00:35<00:08,  1.48s/it]

Title: In-Context Retrieval-Augmented Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "they can mitigate the problem of factually inaccurate text generation and provide natural source attribution mechanism" and "Existing RALM approaches focus on modifying the LM architecture in order to facilitate the incorporation of external information, significantly complicating deployment."

The abstract mentions limitations related to factually inaccurate text generation and the complexity of modifying LM architectures but does not focus extensively on these issues.



Processing papers:  83%|████████████████████████████████████████████████████▏          | 24/29 [00:38<00:08,  1.77s/it]

Title: Learning to Paraphrase Sentences to Different Complexity Levels
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Finally, we establish how a handful of Large Language Models perform on these tasks under a zero-shot setting."

The abstract mentions the performance of Large Language Models (LLMs) in a zero-shot setting but does not discuss any limitations or challenges related to LLMs.



Processing papers:  86%|██████████████████████████████████████████████████████▎        | 25/29 [00:39<00:06,  1.61s/it]

Title: Direct Speech Translation for Automatic Subtitling
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses automatic subtitling and proposes a direct speech translation model, but it does not mention language models (LMs or LLMs) or any limitations related to them.



Processing papers:  90%|████████████████████████████████████████████████████████▍      | 26/29 [00:42<00:05,  1.85s/it]

Title: How Abstract Is Linguistic Generalization in Large Language Models? Experiments with Argument Structure
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "LLMs fail at generalizations between related contexts that have not been observed during pre-training... LLMs show a bias to generalize based on linear order. This finding points to a limitation with current models and points to a reason for which their training is data-intensive."



Processing papers:  93%|██████████████████████████████████████████████████████████▋    | 27/29 [00:43<00:03,  1.84s/it]

Title: Multi 3 WOZ: A Multilingual, Multi-Domain, Multi-Parallel Dataset for Training and Evaluating Culturally Adapted Task-Oriented Dialog Systems
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses the creation and evaluation of a multilingual, multi-domain, and multi-parallel dataset for task-oriented dialog systems. It focuses on the challenges of creating culturally adapted datasets and does not mention language models (LMs or LLMs) or their limitations.



Processing papers:  97%|████████████████████████████████████████████████████████████▊  | 28/29 [00:46<00:01,  1.95s/it]

Title: Can Authorship Representation Learning Capture Stylistic Features?
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses authorship representation learning and its ability to capture stylistic features, but it does not mention language models (LMs or LLMs) or any limitations associated with them. The focus is on authorship attribution and writing style.



Processing papers: 100%|███████████████████████████████████████████████████████████████| 29/29 [00:47<00:00,  1.62s/it]

Title: Optimal Transport Posterior Alignment for Cross-lingual Semantic Parsing
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on cross-lingual semantic parsing and methods to improve it using Optimal Transport. It does not mention language models (LMs or LLMs) or discuss their limitations.






### Prompt3: Categorical Instruction Grouping Prompt

In [8]:
import openai
import json
import csv
from tqdm import tqdm

openai.api_key = 'Enter your API key here'

def evaluate_paper_baseline(title, summary):
    prompt = f"""
    Please evaluate the following paper to determine if it discusses language models (e.g., LMs or LLMs) and whether it addresses their limitations. If it does, indicate the relevant parts of the abstract or title. Note that LMs and LLMs include pre-trained transformer-based language models and multimodal, visual language models. Include all kinds of language models but exclude other, more general models.

    Please rate the papers from 1 to 5:
    - **1:** The abstract does not talk about large language models at all, or even if it talks about 
    LLMs, it does not mention any limitation of them.
    - **2-3:** The abstract mentions just a few limitations of Large Language Models; they are 
    mentioned as secondary points.
    - **4-5:** The abstract explicitly talks a lot about the limitations of Large Language Models 
    and discusses them in detail or it uses strong wording.

    Please answer in the following format by providing the rating and a brief evidence for 
    each abstract.

    Answer in the following way:
    Does it talk about LLMs: [yes/no].
    Rate Limitations of LLMs: [rating from 1-5].
    Evidence: [the evidence text in the abstract or title].

    Title: {title}
    Paper: {summary}
    """

    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-05-13",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5
    )

    return response.choices[0].message['content'].strip()

def read_papers_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers = json.load(file)
    return papers

file_path = '10_gold_standard_papers.json'

csv_file_path = 'my_simpler_prompt_10_results.csv'
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Talks about LLMs', 'Rate', 'Evidence'])

    papers = read_papers_from_json(file_path)

    for paper in tqdm(papers, desc="Processing papers"):
        title = paper.get('title', 'No Title')
        summary = paper.get('summary', 'No Summary')
        evaluation_result = evaluate_paper_baseline(title, summary)

        # parsing the response
        lines = evaluation_result.split('\n')
        talks_about_llms = ''
        rate = ''
        evidence = ''
        
        for line in lines:
            if line.startswith('Does it talk about LLMs:'):
                talks_about_llms = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Rate Limitations of LLMs:'):
                rate = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Evidence:'):
                evidence = line.split(':')[1].strip().strip('[]')

        # results written on csv file 
        writer.writerow([title, talks_about_llms, rate, evidence])
        print(f"Title: {title}\nEvaluation Result: {evaluation_result}\n")


Processing papers:   3%|██▏                                                             | 1/29 [00:02<01:00,  2.15s/it]

Title: Directed Acyclic Transformer Pre-training for High-quality Non-autoregressive Text Generation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "existing NAR models lack proper pre-training, making them still far behind the pre-trained autoregressive models" and "Further analysis shows that PreDAT benefits from the unbiased prediction order that alleviates the error accumulation problem in autoregressive generation."

The abstract mentions limitations of existing NAR models compared to pre-trained autoregressive models and discusses the error accumulation problem in autoregressive generation. However



Processing papers:   7%|████▍                                                           | 2/29 [00:03<00:45,  1.69s/it]

Title: Time-and-Space-Efficient Weighted Deduction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models or their limitations. It focuses on weighted deduction strategies in NLP algorithms and their efficiency in terms of time and space.



Processing papers:  10%|██████▌                                                         | 3/29 [00:04<00:39,  1.52s/it]

Title: Conditional Generation with a Question-Answering Blueprint
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: The abstract mentions that "neural seq-to-seq models...often reveal hallucinations and fail to correctly cover important details," indicating some limitations of these models. However, these limitations are mentioned as secondary points and not discussed in detail.



Processing papers:  14%|████████▊                                                       | 4/29 [00:06<00:37,  1.48s/it]

Title: Collective Human Opinions in Semantic Textual Similarity
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. It focuses on semantic textual similarity (STS) and the introduction of an uncertainty-aware dataset for STS. 

The discussion is centered around human opinions and the performance of current STS models, without any reference to LLMs or their limitations.



Processing papers:  17%|███████████                                                     | 5/29 [00:07<00:33,  1.41s/it]

Title: Design Choices for Crowdsourcing Implicit Discourse Relations: Revealing the Biases Introduced by Task Design
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on biases introduced by task design in crowdsourced linguistic annotations, specifically in the context of implicit discourse relation annotation. It does not discuss large language models or their limitations.



Processing papers:  21%|█████████████▏                                                  | 6/29 [00:09<00:39,  1.72s/it]

Title: Communication Drives the Emergence of Language Universals in Neural Agents: Evidence from the Word-order/Case-marking Trade-off
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on neural agents and their simulations of language emergence and change, specifically addressing the word-order/case-marking trade-off. It does not mention large language models or their limitations.



Processing papers:  24%|███████████████▍                                                | 7/29 [00:11<00:34,  1.56s/it]

Title: A Cross-Linguistic Pressure for Uniform Information Density in Word Order
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses the uniform information density hypothesis and word order patterns in natural languages. There is no mention of large language models or their limitations.



Processing papers:  28%|█████████████████▋                                              | 8/29 [00:12<00:30,  1.46s/it]

Title: Cross-functional Analysis of Generalization in Behavioral Learning
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. The focus is on behavioral learning, generalization, and evaluation methods for NLP tasks, but there is no specific mention of LLMs or their limitations.



Processing papers:  31%|███████████████████▊                                            | 9/29 [00:14<00:35,  1.77s/it]

Title: Exploring Contrast Consistency of Open-Domain Question Answering Systems on Minimally Edited Questions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our collection approach combines both human annotation and large language model generation. We find that the widely used dense passage retriever (DPR) performs poorly on our contrast sets, despite fitting the training set well and performing competitively on standard test sets." 

The abstract mentions the use of large language models for generating questions and highlights a limitation in the performance of a specific model (DPR



Processing papers:  34%|█████████████████████▋                                         | 10/29 [00:16<00:30,  1.60s/it]

Title: Compositional Zero-Shot Domain Transfer with Text-to-Text Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: The abstract mentions the use of a masked language model and discusses a novel compositional transfer learning framework. However, it does not discuss any limitations of large language models. The focus is on the proposed method's effectiveness and performance improvements.



Processing papers:  38%|███████████████████████▉                                       | 11/29 [00:18<00:34,  1.93s/it]

Title: MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses a multilingual retrieval dataset for ad hoc retrieval tasks across various languages and emphasizes the creation and quality control of the dataset. It does not mention large language models (LLMs) or their limitations.



Processing papers:  41%|██████████████████████████                                     | 12/29 [00:19<00:29,  1.72s/it]

Title: DMDD: A Large-Scale Dataset for Dataset Mentions Detection
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. It focuses on dataset mention detection and the introduction of a new dataset for this task.



Processing papers:  45%|████████████████████████████▏                                  | 13/29 [00:21<00:24,  1.54s/it]

Title: T3L: Translate-and-Test Transfer Learning for Cross-Lingual Text Classification
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the performance of these models varies significantly across languages and classification tasks, suggesting that the superposition of the language modelling and classification tasks is not always effective."



Processing papers:  48%|██████████████████████████████▍                                | 14/29 [00:22<00:20,  1.37s/it]

Title: Introduction to Mathematical Language Processing: Informal Proofs, Word Problems, and Supporting Tasks
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses mathematical language processing methods and their sub-areas but does not mention large language models (LLMs) or their limitations.



Processing papers:  52%|████████████████████████████████▌                              | 15/29 [00:23<00:18,  1.36s/it]

Title: Evaluating a Century of Progress on the Cognitive Science of Adjective Ordering
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on the cognitive science of adjective ordering and evaluates various proposals in this area using natural language processing technologies and datasets. It does not mention large language models or their limitations.



Processing papers:  55%|██████████████████████████████████▊                            | 16/29 [00:24<00:16,  1.27s/it]

Title: Improving Multitask Retrieval by Promoting Task Specialization
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses multitask retrieval and the methods to improve it, but it does not mention large language models or their limitations.



Processing papers:  59%|████████████████████████████████████▉                          | 17/29 [00:25<00:15,  1.32s/it]

Title: Calibrated Interpretation: Confidence Estimation in Semantic Parsing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: The abstract mentions "Sequence generation models" which include pre-trained transformer-based language models. It discusses the calibration of these models and indicates that there are variations in calibration error across models and datasets. However, the limitations of these models are mentioned as secondary points, focusing mainly on the aspect of calibration rather than a broad discussion of limitations.



Processing papers:  62%|███████████████████████████████████████                        | 18/29 [00:26<00:13,  1.25s/it]

Title: Intent-calibrated Self-training for Answer Selection in Open-domain Dialogues
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on answer selection in open-domain dialogues and introduces a self-training paradigm using intent labels. It does not mention large language models (LLMs) or their limitations.



Processing papers:  66%|█████████████████████████████████████████▎                     | 19/29 [00:28<00:12,  1.24s/it]

Title: Benchmarking the Generation of Fact Checking Explanations
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or any limitations related to them. It focuses on the generation of fact-checking explanations and summarization approaches without discussing LLMs or their limitations.



Processing papers:  69%|███████████████████████████████████████████▍                   | 20/29 [00:29<00:10,  1.16s/it]

Title: T 2 -NER: A Two-Stage Span-Based Framework for Unified Named Entity Recognition with Templates
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on a two-stage span-based framework for unified Named Entity Recognition (NER) and does not mention large language models or their limitations.



Processing papers:  72%|█████████████████████████████████████████████▌                 | 21/29 [00:30<00:09,  1.21s/it]

Title: PASTA: A Dataset for Modeling PArticipant STAtes in Narratives
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Experiments show that today’s LLMs can reason about states to some degree, but there is large room for improvement, especially in problems requiring access and ability to reason with diverse types of knowledge (e.g., physical, numerical, factual)."



Processing papers:  76%|███████████████████████████████████████████████▊               | 22/29 [00:31<00:08,  1.18s/it]

Title: U-CORE: A Unified Deep Cluster-wise Contrastive Framework for Open Relation Extraction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models or their limitations. It focuses on a novel framework for Open Relation Extraction using Contrastive Learning and Clustering techniques.



Processing papers:  79%|█████████████████████████████████████████████████▉             | 23/29 [00:33<00:07,  1.33s/it]

Title: In-Context Retrieval-Augmented Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "In addition, they can mitigate the problem of factually inaccurate text generation and provide natural source attribution mechanism." This mentions a limitation (factually inaccurate text generation) but does not delve deeply into it within the abstract. The primary focus is on the method and its benefits rather than an in-depth discussion of limitations.



Processing papers:  83%|████████████████████████████████████████████████████▏          | 24/29 [00:34<00:06,  1.36s/it]

Title: Learning to Paraphrase Sentences to Different Complexity Levels
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Finally, we establish how a handful of Large Language Models perform on these tasks under a zero-shot setting."

The abstract mentions the performance of Large Language Models in a zero-shot setting but does not discuss their limitations.



Processing papers:  86%|██████████████████████████████████████████████████████▎        | 25/29 [00:35<00:05,  1.26s/it]

Title: Direct Speech Translation for Automatic Subtitling
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. It focuses on a direct speech translation model for automatic subtitling, discussing its performance and comparison with other systems.



Processing papers:  90%|████████████████████████████████████████████████████████▍      | 26/29 [00:37<00:04,  1.37s/it]

Title: How Abstract Is Linguistic Generalization in Large Language Models? Experiments with Argument Structure
Evaluation Result: Does it talk about LLMs: yes.
Rate Limitations of LLMs: 4.
Evidence: "However, LLMs fail at generalizations between related contexts that have not been observed during pre-training, but which instantiate more abstract, but well-attested structural generalizations... This finding points to a limitation with current models and points to a reason for which their training is data-intensive."



Processing papers:  93%|██████████████████████████████████████████████████████████▋    | 27/29 [00:38<00:02,  1.29s/it]

Title: Multi 3 WOZ: A Multilingual, Multi-Domain, Multi-Parallel Dataset for Training and Evaluating Culturally Adapted Task-Oriented Dialog Systems
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models or their limitations. It focuses on the creation and evaluation of a multilingual, multi-domain, multi-parallel task-oriented dialog dataset.



Processing papers:  97%|████████████████████████████████████████████████████████████▊  | 28/29 [00:39<00:01,  1.24s/it]

Title: Can Authorship Representation Learning Capture Stylistic Features?
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on authorship representation learning and its ability to capture stylistic features. It does not mention or discuss large language models (LLMs) or their limitations.



Processing papers: 100%|███████████████████████████████████████████████████████████████| 29/29 [00:40<00:00,  1.41s/it]

Title: Optimal Transport Posterior Alignment for Cross-lingual Semantic Parsing
Evaluation Result: Does it talk about LLMs: no.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on cross-lingual semantic parsing and methods to improve performance using Optimal Transport. It does not mention large language models or their limitations.






### Prompt4: Selective Few-Shot Prompting Technique

In [22]:
import openai
import json
import csv
from tqdm import tqdm

openai.api_key = 'Enter API key here'

def evaluate_paper_few_shot(title, summary):
    few_shot_examples = """
    ### Example Evaluations:

    **Example 1:**
    Title: SPAE Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs
    Paper: "In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos. SPAE converts between raw pixels and interpretable lexical tokens (or words) extracted from the LLM's vocabulary. The resulting tokens capture both the semantic meaning and the fine-grained details needed for visual reconstruction, effectively translating the visual content into a language comprehensible to the LLM, and empowering it to perform a wide array of multimodal tasks. Our approach is validated through in-context learning experiments with frozen PaLM 2 and GPT 3.5 on a diverse set of image understanding and generation tasks. Our method marks the first successful attempt to enable a frozen LLM to generate image content while surpassing state-of-the-art performance in image understanding tasks, under the same setting, by over 25%"
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 1.
    Evidence: "In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos."

    **Example 2:**
    Title: Large Language Models for Conducting Advanced Text Analytics Information Systems Research
    Paper: "The exponential growth of digital content has generated massive textual datasets, necessitating advanced analytical approaches. Large Language Models (LLMs) have emerged as tools capable of processing and extracting insights from massive unstructured textual datasets. However, how to leverage LLMs for text-based Information Systems (IS) research is currently unclear. To assist IS research in understanding how to operationalize LLMs, we propose a Text Analytics for Information Systems Research (TAISR) framework. Our proposed framework provides detailed recommendations grounded in IS and LLM literature on how to conduct meaningful text-based IS research. We conducted three case studies in business intelligence using our TAISR framework to demonstrate its application across several IS research contexts. We also outline potential challenges and limitations in adopting LLMs for IS. By offering a systematic approach and evidence of its utility, our TAISR framework contributes to future IS research streams looking to incorporate powerful LLMs for text analytics."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 2.
    Evidence: "We also outline potential challenges and limitations in adopting LLMs for IS."
    
    **Example 3:**
    Title: Meta-Reasoning: Semantics-Symbol Deconstruction for Large Language Models
    Paper: "Neural-symbolic methods have demonstrated efficiency in enhancing the reasoning abilities of large language models (LLMs). However, existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits. To broaden symbolic methods' applicability and adaptability in the real world, we propose the Meta-Reasoning from a linguistic perspective. This method empowers LLMs to deconstruct reasoning-independent semantic information into generic symbolic representations, thereby efficiently capturing more generalized reasoning knowledge. We conduct extensive experiments on more than ten datasets encompassing conventional reasoning tasks like arithmetic, symbolic, and logical reasoning, and the more complex interactive reasoning tasks like theory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning significantly enhances in-context reasoning accuracy, learning efficiency, out-of-domain generalization, and output stability compared to the Chain-of-Thought technique. Code and data are publicly available at \\url{https://github.com/Alsace08/Meta-Reasoning}."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 3.
    Evidence: "existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits."
   
    **Example 4:**
    Title: Fairness in Large Language Models: A Taxonomic Survey
    Paper: "Large Language Models (LLMs) have demonstrated remarkable success across various domains. However, despite their promising performance in numerous real-world applications, most of these algorithms lack fairness considerations. Consequently, they may lead to discriminatory outcomes against certain communities, particularly marginalized populations, prompting extensive study in fair LLMs. On the other hand, fairness in LLMs, in contrast to fairness in traditional machine learning, entails exclusive backgrounds, taxonomies, and fulfillment techniques. To this end, this survey presents a comprehensive overview of recent advances in the existing literature concerning fair LLMs. Specifically, a brief introduction to LLMs is provided, followed by an analysis of factors contributing to bias in LLMs. Additionally, the concept of fairness in LLMs is discussed categorically, summarizing metrics for evaluating bias in LLMs and existing algorithms for promoting fairness. Furthermore, resources for evaluating bias in LLMs, including toolkits and datasets, are summarized. Finally, existing research challenges and open questions are discussed."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 4.
    Evidence: "most of these algorithms lack fairness considerations. Consequently, they may lead to discriminatory outcomes against certain communities, particularly marginalized populations," and "an analysis of factors contributing to bias in LLMs. Additionally, the concept of fairness in LLMs is discussed categorically, summarizing metrics for evaluating bias in LLMs and existing algorithms for promoting fairness. Furthermore, resources for evaluating bias in LLMs, including toolkits and datasets, are summarized."
    
    **Example 5:**
    Title: Lost in the Middle: How Language Models Use Long Contexts
    Paper: "While recent language models have the ability to take long contexts as input, relatively little is known about how well they use longer context. We analyze the performance of language models on two tasks that require identifying relevant information in their input contexts: multi-document question answering and key-value retrieval. We find that performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts. In particular, we observe that performance is often highest when relevant information occurs at the beginning or end of the input context, and significantly degrades when models must access relevant information in the middle of long contexts, even for explicitly long-context models. Our analysis provides a better understanding of how language models use their input context and provides new evaluation protocols for future long-context language models."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 5.
    Evidence: "We find that performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts."
    
    """
    
    prompt = f"""
    Please evaluate the following paper to determine if it discusses language models (e.g., LMs or LLMs) and whether it addresses their limitations. If it does, indicate the relevant parts of the abstract or title. Note that LMs and LLMs include pre-trained transformer-based language models and multimodal, visual language models. Include all kinds of language models but exclude other, more general models.

    Please rate the papers from 1 to 5:
    - **1:** The abstract does not talk about large language models at all, or even if it talks about LLMs, it does not mention any limitation of them.
    - **2-3:** The abstract mentions just a few limitations of Large Language Models; they are mentioned as secondary points.
    - **4-5:** The abstract explicitly talks a lot about the limitations of Large Language Models and discusses them in detail or it uses strong wording.

    {few_shot_examples}
    
    Please answer in the following format by providing the rating and a brief evidence for each abstract.

    Answer in the following way:
    Does it talk about LLMs: [yes/no].
    Rate Limitations of LLMs: [1-5].
    Evidence: [the evidence text in the abstract or title].

    Title: {title}
    Paper: {summary}
    """

    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-05-13",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5
    )

    return response.choices[0]['message']['content'].strip()

def read_papers_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers = json.load(file)
    return papers

file_path = '10_gold_standard_papers.json'

csv_file_path = 'few_shot_prompt_10_results.csv'
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Talks about LLMs', 'Rate', 'Evidence'])

    papers = read_papers_from_json(file_path)

    for paper in tqdm(papers, desc="Processing papers"):
        title = paper.get('title', 'No Title')
        summary = paper.get('summary', 'No Summary')
        evaluation_result = evaluate_paper_few_shot(title, summary)

        # parsing the response
        lines = evaluation_result.split('\n')
        talks_about_llms = ''
        rate = ''
        evidence = ''
        
        for line in lines:
            if line.startswith('Does it talk about LLMs:'):
                talks_about_llms = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Rate Limitations of LLMs:'):
                rate = line.split(':')[1].strip().strip('[]')
            elif line.startswith('Evidence:'):
                evidence = line.split(':')[1].strip().strip('[]')

        # results written on csv file 
        writer.writerow([title, talks_about_llms, rate, evidence])
        print(f"Title: {title}\nEvaluation Result: {evaluation_result}\n")



Processing papers:   3%|██▏                                                             | 1/29 [00:01<00:46,  1.66s/it]

Title: Directed Acyclic Transformer Pre-training for High-quality Non-autoregressive Text Generation
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on Non-AutoRegressive (NAR) text generation models and their pre-training, without mentioning large language models or their limitations.



Processing papers:   7%|████▍                                                           | 2/29 [00:05<01:12,  2.68s/it]

Title: Time-and-Space-Efficient Weighted Deduction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses weighted deduction strategies and their efficiency in terms of time and space. There is no mention of language models, large language models, or their limitations.



Processing papers:  10%|██████▌                                                         | 3/29 [00:07<01:05,  2.53s/it]

Title: Conditional Generation with a Question-Answering Blueprint
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses conditional generation, neural seq-to-seq models, and the use of question-answer pairs as intermediate representations for planning in text generation. It does not mention large language models (LLMs) or their limitations.



Processing papers:  14%|████████▊                                                       | 4/29 [00:10<01:09,  2.76s/it]

Title: Collective Human Opinions in Semantic Textual Similarity
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses semantic textual similarity (STS) and the limitations of current STS models in capturing human disagreement, but it does not mention large language models (LLMs) or any limitations specific to them.



Processing papers:  17%|███████████                                                     | 5/29 [00:11<00:54,  2.29s/it]

Title: Design Choices for Crowdsourcing Implicit Discourse Relations: Revealing the Biases Introduced by Task Design
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses biases introduced by task design in crowdsourcing linguistic annotations and does not mention large language models (LLMs) or their limitations.



Processing papers:  21%|█████████████▏                                                  | 6/29 [00:13<00:49,  2.14s/it]

Title: Communication Drives the Emergence of Language Universals in Neural Agents: Evidence from the Word-order/Case-marking Trade-off
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses neural agent-based simulations of language emergence and change, focusing on the word-order/case-marking trade-off and a new framework for neural-agent language learning and communication. It does not mention large language models (LLMs) or their limitations.



Processing papers:  24%|███████████████▍                                                | 7/29 [00:15<00:44,  2.02s/it]

Title: A Cross-Linguistic Pressure for Uniform Information Density in Word Order
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses cross-linguistic statistical patterns and the uniform information density hypothesis in natural languages, without any mention of large language models or their limitations.



Processing papers:  28%|█████████████████▋                                              | 8/29 [00:17<00:40,  1.95s/it]

Title: Cross-functional Analysis of Generalization in Behavioral Learning
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses behavioral learning, generalization, and evaluation methods for NLP tasks like sentiment analysis, paraphrase identification, and reading comprehension. It does not mention large language models (LLMs) or their limitations.



Processing papers:  31%|███████████████████▊                                            | 9/29 [00:19<00:41,  2.09s/it]

Title: Exploring Contrast Consistency of Open-Domain Question Answering Systems on Minimally Edited Questions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our collection approach combines both human annotation and large language model generation." and "We find that the widely used dense passage retriever (DPR) performs poorly on our contrast sets, despite fitting the training set well and performing competitively on standard test sets."

The abstract mentions the use of large language models for generating minimally edited questions and points out some limitations related to the performance of dense



Processing papers:  34%|█████████████████████▋                                         | 10/29 [00:21<00:37,  1.98s/it]

Title: Compositional Zero-Shot Domain Transfer with Text-to-Text Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We propose a novel compositional transfer learning framework (DoT51) for zero-shot domain transfer." The abstract primarily discusses the proposed framework and its effectiveness without mentioning limitations of large language models.



Processing papers:  38%|███████████████████████▉                                       | 11/29 [00:23<00:34,  1.93s/it]

Title: MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages
Evaluation Result: **Title: MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages**

Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on the creation and features of the MIRACL multilingual dataset for ad hoc retrieval tasks, without mentioning language models or their limitations.



Processing papers:  41%|██████████████████████████                                     | 12/29 [00:24<00:30,  1.82s/it]

Title: DMDD: A Large-Scale Dataset for Dataset Mentions Detection
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models or their limitations. It focuses on dataset mention detection in scientific literature and introduces a new corpus for this task.



Processing papers:  45%|████████████████████████████▏                                  | 13/29 [00:26<00:28,  1.76s/it]

Title: T3L: Translate-and-Test Transfer Learning for Cross-Lingual Text Classification
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the performance of these models varies significantly across languages and classification tasks, suggesting that the superposition of the language modelling and classification tasks is not always effective."



Processing papers:  48%|██████████████████████████████▍                                | 14/29 [00:28<00:26,  1.75s/it]

Title: Introduction to Mathematical Language Processing: Informal Proofs, Word Problems, and Supporting Tasks
Evaluation Result: **Does it talk about LLMs:** No.
**Rate Limitations of LLMs:** 1.
**Evidence:** The abstract discusses mathematical language processing methods and their applications in automating discovery in mathematics and science. It does not mention large language models (LLMs) or their limitations.



Processing papers:  52%|████████████████████████████████▌                              | 15/29 [00:30<00:24,  1.76s/it]

Title: Evaluating a Century of Progress on the Cognitive Science of Adjective Ordering
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on the cognitive science of adjective ordering and the use of natural language processing technologies and datasets to evaluate proposals across multiple languages. It does not mention large language models or their limitations.



Processing papers:  55%|██████████████████████████████████▊                            | 16/29 [00:31<00:21,  1.62s/it]

Title: Improving Multitask Retrieval by Promoting Task Specialization
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on multitask retrieval and does not mention language models (LLMs or LMs) or their limitations.



Processing papers:  59%|████████████████████████████████████▉                          | 17/29 [00:33<00:21,  1.76s/it]

Title: Calibrated Interpretation: Confidence Estimation in Semantic Parsing
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses sequence generation models used for semantic parsing and focuses on calibration and confidence estimation in these models. It does not mention large language models (LLMs) or their limitations.



Processing papers:  62%|███████████████████████████████████████                        | 18/29 [00:35<00:19,  1.81s/it]

Title: Intent-calibrated Self-training for Answer Selection in Open-domain Dialogues
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. It focuses on the intent-calibrated self-training (ICAST) method for answer selection in open-domain dialogues.



Processing papers:  66%|█████████████████████████████████████████▎                     | 19/29 [00:36<00:17,  1.73s/it]

Title: Benchmarking the Generation of Fact Checking Explanations
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or any limitations associated with them. It focuses on fact-checking and summarization approaches.



Processing papers:  69%|███████████████████████████████████████████▍                   | 20/29 [00:38<00:15,  1.73s/it]

Title: T 2 -NER: A Two-Stage Span-Based Framework for Unified Named Entity Recognition with Templates
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention large language models (LLMs) or their limitations. It focuses on a two-stage span-based framework for Named Entity Recognition (NER).



Processing papers:  72%|█████████████████████████████████████████████▌                 | 21/29 [00:40<00:14,  1.76s/it]

Title: PASTA: A Dataset for Modeling PArticipant STAtes in Narratives
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Experiments show that today’s LLMs can reason about states to some degree, but there is large room for improvement, especially in problems requiring access and ability to reason with diverse types of knowledge (e.g., physical, numerical, factual)."



Processing papers:  76%|███████████████████████████████████████████████▊               | 22/29 [00:42<00:13,  1.88s/it]

Title: U-CORE: A Unified Deep Cluster-wise Contrastive Framework for Open Relation Extraction
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract focuses on Open Relation Extraction (ORE) tasks and introduces a framework called U-CORE for Zero-shot and Unsupervised ORE. There is no mention of large language models (LLMs) or their limitations.



Processing papers:  79%|█████████████████████████████████████████████████▉             | 23/29 [00:44<00:11,  1.84s/it]

Title: In-Context Retrieval-Augmented Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "In addition, they can mitigate the problem of factually inaccurate text generation and provide natural source attribution mechanism." and "Existing RALM approaches focus on modifying the LM architecture in order to facilitate the incorporation of external information, significantly complicating deployment."



Processing papers:  83%|████████████████████████████████████████████████████▏          | 24/29 [00:45<00:08,  1.72s/it]

Title: Learning to Paraphrase Sentences to Different Complexity Levels
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Finally, we establish how a handful of Large Language Models perform on these tasks under a zero-shot setting."



Processing papers:  86%|██████████████████████████████████████████████████████▎        | 25/29 [00:47<00:07,  1.76s/it]

Title: Direct Speech Translation for Automatic Subtitling
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses a direct speech translation model for automatic subtitling, focusing on the generation of subtitles and timestamps. It does not mention large language models (LLMs) or their limitations.



Processing papers:  90%|████████████████████████████████████████████████████████▍      | 26/29 [00:50<00:05,  1.94s/it]

Title: How Abstract Is Linguistic Generalization in Large Language Models? Experiments with Argument Structure
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, LLMs fail at generalizations between related contexts that have not been observed during pre-training, but which instantiate more abstract, but well-attested structural generalizations (e.g., between the active object and passive subject of an arbitrary verb). Instead, in this case, LLMs show a bias to generalize based on linear order. This finding points to a limitation



Processing papers:  93%|██████████████████████████████████████████████████████████▋    | 27/29 [00:53<00:04,  2.28s/it]

Title: Multi 3 WOZ: A Multilingual, Multi-Domain, Multi-Parallel Dataset for Training and Evaluating Culturally Adapted Task-Oriented Dialog Systems
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses the creation and evaluation of a multilingual, multi-domain, multi-parallel dataset for task-oriented dialog systems. It does not mention large language models (LLMs) or their limitations.



Processing papers:  93%|██████████████████████████████████████████████████████████▋    | 27/29 [00:53<00:03,  1.97s/it]


RateLimitError: Rate limit reached for gpt-4o in organization org-cIU00Ks8lIudySaVYr0LQOp3 on tokens per min (TPM): Limit 30000, Used 27740, Requested 2704. Please try again in 888ms. Visit https://platform.openai.com/account/rate-limits to learn more.

### Prompt5: Detailed Explanatory Few-Shot Prompting

In [5]:
import openai
import json
import csv
import os
from tqdm import tqdm
import time

#openai.api_key = 'Enter your API key here' #my api 
openai.api_key = 'Enter your API key here' #---> 2nd API
# Keep track of progress with an index
progress_file = 'progress_few_shot.txt'

def evaluate_paper_few_shot(title, summary):
    prompt = f"""
    Please evaluate the following paper to determine if it discusses language models (e.g., LMs or LLMs) and whether it addresses their limitations. If it does, indicate the relevant parts of the abstract or title. Note that LMs and LLMs include pre-trained transformer-based language models and multimodal, visual language models. Include all kinds of language models but exclude other, more general models.
    Please look at the following examples alongside the explanations on why decided the respective ratings and rate the other abstracts from 1 to 5 accordingly by following the same logic as below: 
    
    **Example Output 1:**
    Title: SPAE Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs
    Paper: "In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos. SPAE converts between raw pixels and interpretable lexical tokens (or words) extracted from the LLM's vocabulary. The resulting tokens capture both the semantic meaning and the fine-grained details needed for visual reconstruction, effectively translating the visual content into a language comprehensible to the LLM, and empowering it to perform a wide array of multimodal tasks. Our approach is validated through in-context learning experiments with frozen PaLM 2 and GPT 3.5 on a diverse set of image understanding and generation tasks. Our method marks the first successful attempt to enable a frozen LLM to generate image content while surpassing state-of-the-art performance in image understanding tasks, under the same setting, by over 25%"
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 1.
    Evidence: "In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos."
    
    Output Explanation: This paper should be rated with 1 since even though it talks about LLMs, it does not mention any explicit limitation of the models in the abstract. Note: Additionally, papers that do not talk about LLMs at all, rate them with 1. 
    
    **Example Output 2:**
    Title: Large Language Models for Conducting Advanced Text Analytics Information Systems Research
    Paper: "The exponential growth of digital content has generated massive textual datasets, necessitating advanced analytical approaches. Large Language Models (LLMs) have emerged as tools capable of processing and extracting insights from massive unstructured textual datasets. However, how to leverage LLMs for text-based Information Systems (IS) research is currently unclear. To assist IS research in understanding how to operationalize LLMs, we propose a Text Analytics for Information Systems Research (TAISR) framework. Our proposed framework provides detailed recommendations grounded in IS and LLM literature on how to conduct meaningful text-based IS research. We conducted three case studies in business intelligence using our TAISR framework to demonstrate its application across several IS research contexts. We also outline potential challenges and limitations in adopting LLMs for IS. By offering a systematic approach and evidence of its utility, our TAISR framework contributes to future IS research streams looking to incorporate powerful LLMs for text analytics."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 2.
    Evidence: "We also outline potential challenges and limitations in adopting LLMs for IS."
    
    Output Explanation: This abstract mentions just one limitation of the Large Language Models and focuses on other topics.
    
    **Example Output 3:**
    Title: Meta-Reasoning: Semantics-Symbol Deconstruction for Large Language Models
    Paper: "Neural-symbolic methods have demonstrated efficiency in enhancing the reasoning abilities of large language models (LLMs). However, existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits. To broaden symbolic methods' applicability and adaptability in the real world, we propose the Meta-Reasoning from a linguistic perspective. This method empowers LLMs to deconstruct reasoning-independent semantic information into generic symbolic representations, thereby efficiently capturing more generalized reasoning knowledge. We conduct extensive experiments on more than ten datasets encompassing conventional reasoning tasks like arithmetic, symbolic, and logical reasoning, and the more complex interactive reasoning tasks like theory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning significantly enhances in-context reasoning accuracy, learning efficiency, out-of-domain generalization, and output stability compared to the Chain-of-Thought technique. Code and data are publicly available at: ."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 3.
    Evidence: "existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits."
    
    Output Explanation: This abstract mentions few limitations but not in great detail or as the main focus.   
    
    **Example Output 4:**
    Title: Fairness in Large Language Models: A Taxonomic Survey
    Paper: "Large Language Models (LLMs) have demonstrated remarkable success across various domains. However, despite their promising performance in numerous real-world applications, most of these algorithms lack fairness considerations. Consequently, they may lead to discriminatory outcomes against certain communities, particularly marginalized populations, prompting extensive study in fair LLMs. On the other hand, fairness in LLMs, in contrast to fairness in traditional machine learning, entails exclusive backgrounds, taxonomies, and fulfillment techniques. To this end, this survey presents a comprehensive overview of recent advances in the existing literature concerning fair LLMs. Specifically, a brief introduction to LLMs is provided, followed by an analysis of factors contributing to bias in LLMs. Additionally, the concept of fairness in LLMs is discussed categorically, summarizing metrics for evaluating bias in LLMs and existing algorithms for promoting fairness. Furthermore, resources for evaluating bias in LLMs, including toolkits and datasets, are summarized. Finally, existing research challenges and open questions are discussed."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 4.
    Evidence: "most of these algorithms lack fairness considerations. Consequently, they may lead to discriminatory outcomes against certain communities, particularly marginalized populations," and "an analysis of factors contributing to bias in LLMs. Additionally, the concept of fairness in LLMs is discussed categorically, summarizing metrics for evaluating bias in LLMs and existing algorithms for promoting fairness. Furthermore, resources for evaluating bias in LLMs, including toolkits and datasets, are summarized."
    
    Output Explanation: The paper mentions several limitations related to fairness and bias in LLMs. It discusses these limitations in detail and they are significant, but are discussed alongside other topic.
    
    **Example Output 5:**
    Title: Lost in the Middle: How Language Models Use Long Contexts
    Paper: "While recent language models have the ability to take long contexts as input, relatively little is known about how well they use longer context. We analyze the performance of language models on two tasks that require identifying relevant information in their input contexts: multi-document question answering and key-value retrieval. We find that performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts. In particular, we observe that performance is often highest when relevant information occurs at the beginning or end of the input context, and significantly degrades when models must access relevant information in the middle of long contexts, even for explicitly long-context models. Our analysis provides a better understanding of how language models use their input context and provides new evaluation protocols for future long-context language models."
    Does it talk about LLMs: Yes.
    Rate Limitations of LLMs: 5.
    Evidence: "We find that performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts."
    
    Output Explanation: The entire abstract focuses on the limitations and challenges associated with LLMs' ability to handle long contexts. The whole abstract represents a detailed discussion of a critical limitation

    Please answer in the following format by providing the rating and a brief evidence for each abstract. Please do not give the respective Explanations, only the evidence found in the abstract. 

    Answer in the following way:
    Does it talk about LLMs: [yes/no].
    Rate Limitations of LLMs: [1-5].
    Evidence: [the evidence text in the abstract or title].

    Title: {title}
    Paper: {summary}
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-2024-05-13",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0.5
        )
        return response.choices[0]['message']['content'].strip()
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}")
        raise
    except Exception as e:
        print(f"Error during API call: {e}")
        return None

def read_papers_from_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            papers = json.load(file)
        print(f"Successfully read {len(papers)} papers from the JSON file.")
        return papers
    except Exception as e:
        print(f"Error reading JSON file: {e}")
        return []

def get_last_processed_index():
    if os.path.exists(progress_file):
        try:
            with open(progress_file, 'r') as file:
                return int(file.read().strip())
        except Exception as e:
            print(f"Error reading progress file: {e}")
            return -1
    return -1

def save_progress(index):
    try:
        with open(progress_file, 'w') as file:
            file.write(str(index))
        print(f"Progress saved at index: {index}")
    except Exception as e:
        print(f"Error saving progress: {e}")
file_path = 'arXiv_classification_yes_no_results/arXiv_01Feb_to_31Mar_2024_LLM_limitations_part5.json'
csv_file_path = 'arXiv_limitation_ratings/Feb_Mar_2024_ratings_part5.csv'
last_index = get_last_processed_index()
print(f"Last processed index: {last_index}")

papers = read_papers_from_json(file_path)

if not papers:
    print("No papers to process. Exiting.")
    exit()
if last_index >= len(papers):
    last_index = -1

with open(csv_file_path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    if last_index == -1:
        writer.writerow(['Title', 'Talks about LLMs', 'Rate', 'Evidence', 'Published'])
        print("Header written to CSV file.")

    for index, paper in enumerate(tqdm(papers, desc="Processing papers"), start=0):
        if index <= last_index:
            continue

        title = paper.get('title', 'No Title')
        summary = paper.get('summary', 'No Summary')
        published = paper.get('published', 'No Date')  
        
        max_retries = 5
        retries = 0
        delay = 2
        while retries < max_retries:
            try:
                evaluation_result = evaluate_paper_few_shot(title, summary)
                
                if evaluation_result is None:
                    print(f"Skipping paper at index {index} due to API call failure.")
                    break
                lines = evaluation_result.split('\n')
                talks_about_llms = ''
                rate = ''
                evidence = ''
                
                for line in lines:
                    if line.startswith('Does it talk about LLMs:'):
                        talks_about_llms = line.split(':')[1].strip().strip('[]')
                    elif line.startswith('Rate Limitations of LLMs:'):
                        rate = line.split(':')[1].strip().strip('[]')
                    elif line.startswith('Evidence:'):
                        evidence = line.split(':')[1].strip().strip('[]')
                writer.writerow([title, talks_about_llms, rate, evidence, published])
                print(f"Title: {title}\nEvaluation Result: {evaluation_result}\n")

                save_progress(index)

                time.sleep(1) 
                break
            except openai.error.RateLimitError:
                retries += 1
                print(f"Rate limit error. Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2 
                continue
            except Exception as e:
                print(f"An error occurred at index {index}: {e}")
                break

print("Processing complete.")


Error reading progress file: invalid literal for int() with base 10: ''
Last processed index: -1
Successfully read 511 papers from the JSON file.
Header written to CSV file.


Processing papers:   0%|                                                                       | 0/511 [00:00<?, ?it/s]

Title: Generalization in Healthcare AI: Evaluation of a Clinical Large Language Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "the potential of these models importantly depends on their ability to generalize effectively across clinical environments and populations, a challenge often underestimated in early development." and "We found poorer generalization particularly in hospitals with fewer samples, among patients with government and unspecified insurance, the elderly, and those with high comorbidities."

Progress saved at index: 0


Processing papers:   0%|                                                               | 1/511 [00:02<23:50,  2.80s/it]

Title: Open-Vocabulary Segmentation with Unpaired Mask-Text Supervision
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we advocate using the large vision-language model (LVLM) to refine text descriptions and devise a multi-scale ensemble to stablise the matching between masks and entities."

Progress saved at index: 1


Processing papers:   0%|▏                                                              | 2/511 [00:05<21:13,  2.50s/it]

Title: MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data
Evaluation Result: Title: MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "As these two tasks require strict and formal multi-step inference, they are appealing domains for exploring the reasoning ability of LLMs but still face important challenges."

Progress saved at index: 2


Processing papers:   1%|▎                                                              | 3/511 [00:07<22:13,  2.63s/it]

Title: Using Counterfactual Tasks to Evaluate the Generality of Analogical Reasoning in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "This work provides evidence that, despite previously reported successes of LLMs on analogical reasoning, these models lack the robustness and generality of human analogy-making."

Progress saved at index: 3


Processing papers:   1%|▍                                                              | 4/511 [00:10<21:09,  2.50s/it]

Title: Premise Order Matters in Reasoning with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "we discover a frailty: LLMs are surprisingly brittle to the ordering of the premises, despite the fact that such ordering does not alter the underlying task."

Progress saved at index: 4


Processing papers:   1%|▌                                                              | 5/511 [00:12<21:20,  2.53s/it]

Title: MaxMin-RLHF: Towards Equitable Alignment of Large Language Models with Diverse Human Preferences
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "we first derive an impossibility result of alignment with single reward RLHF, thereby highlighting its insufficiency in representing diverse human preferences."

Progress saved at index: 5


Processing papers:   1%|▋                                                              | 6/511 [00:15<21:06,  2.51s/it]

Title: Tree-Based Hard Attention with Self-Motivation for Large Language Models
Evaluation Result: Title: Tree-Based Hard Attention with Self-Motivation for Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "While large language models (LLMs) excel at understanding and generating plain text, they are not specifically tailored to handle hierarchical text structures. Extracting the task-desired property from their natural language responses typically necessitates additional processing steps."

Progress saved at index: 6


Processing papers:   1%|▊                                                              | 7/511 [00:18<23:31,  2.80s/it]

Title: Large Language Model with Graph Convolution for Recommendation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "existing ways of prompting LLMs with raw texts ignore structured knowledge of user-item interactions, which may lead to hallucination problems like inconsistent description generation."

Progress saved at index: 7


Processing papers:   2%|▉                                                              | 8/511 [00:21<23:47,  2.84s/it]

Title: GhostWriter: Augmenting Collaborative Human-AI Writing Experiences Through Personalization and Agency
Evaluation Result: Title: GhostWriter: Augmenting Collaborative Human-AI Writing Experiences Through Personalization and Agency

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "However, LLM-powered writing systems can frustrate users due to their limited personalization and control, which can be exacerbated when users lack experience with prompt engineering."

Progress saved at index: 8


Processing papers:   2%|█                                                              | 9/511 [00:24<23:30,  2.81s/it]

Title: eCeLLM: Generalizing Large Language Models for E-commerce from Large-scale, High-quality Instruction Data
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "large language models (LLMs) demonstrate outstanding performance in generalist modeling and out-of-domain generalizability in many fields."

Progress saved at index: 9


Processing papers:   2%|█▏                                                            | 10/511 [00:26<21:58,  2.63s/it]

Title: Combining Insights From Multiple Large Language Models Improves Diagnostic Accuracy
Evaluation Result: Title: Combining Insights From Multiple Large Language Models Improves Diagnostic Accuracy

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, even LLMs specifically trained on medical topics may lack sufficient diagnostic accuracy for real-life applications."

Progress saved at index: 10


Processing papers:   2%|█▎                                                            | 11/511 [00:29<21:59,  2.64s/it]

Title: ChatGPT vs LLaMA: Impact, Reliability, and Challenges in Stack Overflow Discussions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "identify and understand why LLMs fails" and "ChatGPT and LLaMA challenge human expertise, yet do not outperform it for some domains."

Progress saved at index: 11


Processing papers:   2%|█▍                                                            | 12/511 [00:32<22:49,  2.74s/it]

Title: Rethinking Machine Unlearning for Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We highlight the often-overlooked aspects of existing LLM unlearning research, e.g., unlearning scope, data-model interaction, and multifaceted efficacy assessment."

Progress saved at index: 12


Processing papers:   3%|█▌                                                            | 13/511 [00:34<21:49,  2.63s/it]

Title: GLoRe: When, Where, and How to Improve LLM Reasoning via Global and Local Refinements
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, recent work demonstrates that even the best models struggle to identify when and where to refine without access to external feedback." and "But they are expensive to train, requiring extensive human annotations."

Progress saved at index: 13


Processing papers:   3%|█▋                                                            | 14/511 [00:37<21:59,  2.66s/it]

Title: Measuring and Controlling Instruction (In)Stability in Language Model Dialogs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Testing popular models like LLaMA2-chat-70B and GPT-3.5, we reveal a significant instruction drift within eight rounds of conversations."

Progress saved at index: 14


Processing papers:   3%|█▊                                                            | 15/511 [00:39<21:12,  2.57s/it]

Title: JAMDEC: Unsupervised Authorship Obfuscation using Constrained Decoding over Small Language Models
Evaluation Result: Title: JAMDEC: Unsupervised Authorship Obfuscation using Constrained Decoding over Small Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our approach builds on small language models such as GPT2-XL in order to help avoid disclosing the original content to proprietary LLM's APIs, while also reducing the performance gap between small and large language models via algorithmic enhancement."

Progress saved at index: 15


Processing papers:   3%|█▉                                                            | 16/511 [00:42<22:27,  2.72s/it]

Title: LLM-driven Imitation of Subrational Behavior : Illusion or Reality?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We conclude by discussing the potential benefits, challenges and limitations of our framework."

Progress saved at index: 16


Processing papers:   3%|██                                                            | 17/511 [00:44<20:59,  2.55s/it]

Title: Mitigating Object Hallucination in Large Vision-Language Models via Classifier-Free Guidance
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "highlighted the critical issue of their tendency to hallucinate non-existing objects in the images" and "these approaches require either expensive training/fine-tuning or API access to advanced LLMs to correct the model's output post-generation."

Progress saved at index: 17


Processing papers:   4%|██▏                                                           | 18/511 [00:48<23:17,  2.84s/it]

Title: COLD-Attack: Jailbreaking LLMs with Stealthiness and Controllability
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Jailbreaks on Large language models (LLMs) have recently received increasing attention... Our extensive experiments on various LLMs (Llama-2, Mistral, Vicuna, Guanaco, GPT-3.5) show COLD-Attack's broad applicability, strong controllability, high success rate, and attack transferability."

Progress saved at index: 18


Processing papers:   4%|██▎                                                           | 19/511 [00:51<24:18,  2.96s/it]

Title: Human Curriculum Effects Emerge with In-Context Learning in Neural Networks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Here we show that this same tradeoff spontaneously emerges with 'in-context learning' (ICL) both in neural networks trained with metalearning and in large language models (LLMs)."

Progress saved at index: 19


Processing papers:   4%|██▍                                                           | 20/511 [00:53<22:32,  2.75s/it]

Title: Improving Generalization in Semantic Parsing by Increasing Natural Language Variation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, it has also been shown that these models often struggle to generalize even when faced with small perturbations of previously (accurately) parsed expressions."

Progress saved at index: 20


Processing papers:   4%|██▌                                                           | 21/511 [00:56<21:29,  2.63s/it]

Title: The Last JITAI? The Unreasonable Effectiveness of Large Language Models in Issuing Just-in-Time Adaptive Interventions: Fostering Physical Activity in a Prospective Cardiac Rehabilitation Setting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We explored the viability of Large Language Models (LLMs) for triggering and personalizing content for Just-in-Time Adaptive Interventions (JITAIs) in digital health."

Progress saved at index: 21


Processing papers:   4%|██▋                                                           | 22/511 [00:58<20:41,  2.54s/it]

Title: PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Nevertheless, these models face challenges in the fundamental computer vision task of object localisation, due to their training on multimodal data containing mostly captions without explicit spatial grounding."

Progress saved at index: 22


Processing papers:   5%|██▊                                                           | 23/511 [01:01<20:44,  2.55s/it]

Title: Tandem Transformers for Inference Efficient LLMs
Evaluation Result: Title: Tandem Transformers for Inference Efficient LLMs  
Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 3.  
Evidence: "The autoregressive nature of conventional large language models (LLMs) inherently limits inference speed, as tokens are generated sequentially. While speculative and parallel decoding techniques attempt to mitigate this, they face limitations: either relying on less accurate smaller models for generation or failing to fully leverage the base LLM's representations."

Progress saved at index: 23


Processing papers:   5%|██▉                                                           | 24/511 [01:03<21:18,  2.63s/it]

Title: SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages
Evaluation Result: Title: SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "offering insights into the capabilities and performance of Large Language Models (LLMs)."

Progress saved at index: 24


Processing papers:   5%|███                                                           | 25/511 [01:06<21:21,  2.64s/it]

Title: Knowledge Editing on Black-box Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "To tackle privacy leaks of editing data and style over-editing in current methods, we introduce a novel postEdit framework, resolving privacy concerns through downstream post-processing and maintaining textual style consistency via fine-grained editing to original responses."

Progress saved at index: 25


Processing papers:   5%|███▏                                                          | 26/511 [01:09<21:21,  2.64s/it]

Title: PRompt Optimization in Multi-Step Tasks (PROMST): Integrating Human Feedback and Preference Alignment
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "realistic tasks for agents are multi-step and introduce new challenges: (1) Prompt content is likely to be more extensive and complex, making it more difficult for LLMs to analyze errors, (2) the impact of an individual step is difficult to evaluate, and (3) different people may have varied preferences about task execution."

Progress saved at index: 26


Processing papers:   5%|███▎                                                          | 27/511 [01:12<22:02,  2.73s/it]

Title: Test-Time Backdoor Attacks on Multimodal Large Language Models
Evaluation Result: Title: Test-Time Backdoor Attacks on Multimodal Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this work, we present AnyDoor, a test-time backdoor attack against multimodal large language models (MLLMs), which involves injecting the backdoor into the textual modality using adversarial test images."

Progress saved at index: 27


Processing papers:   5%|███▍                                                          | 28/511 [01:15<22:13,  2.76s/it]

Title: Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast
Evaluation Result: Title: Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Nonetheless, red-teaming efforts have revealed that adversarial images/prompts can jailbreak an MLLM and cause unaligned behaviors." and "It entails the adversary simply jailbreaking a single agent, and without any further intervention from the adversary, (almost

Progress saved at index: 28


Processing papers:   6%|███▌                                                          | 29/511 [01:17<22:25,  2.79s/it]

Title: Auditing Counterfire: Evaluating Advanced Counterargument Generation with Evidence and Style
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We audited counter-arguments generated by large language models (LLMs), focusing on their ability to generate evidence-based and stylistic counter-arguments to posts from the Reddit ChangeMyView dataset."

Progress saved at index: 29


Processing papers:   6%|███▋                                                          | 30/511 [01:20<23:07,  2.88s/it]

Title: The Application of ChatGPT in Responding to Questions Related to the Boston Bowel Preparation Scale
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "ChatGPT's accuracy varied between 48.93% and 62.66%, trailing the endoscopists' accuracy of 76.68% to 77.83%. Kappa values for ChatGPT was between 0.52 and 0.53, compared to 0.75 to 0.87 for the endoscopists."

Progress saved at index: 30


Processing papers:   6%|███▊                                                          | 31/511 [01:23<22:42,  2.84s/it]

Title: Lying Blindly: Bypassing ChatGPT's Safeguards to Generate Hard-to-Detect Disinformation Claims at Scale
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "This study explores the capability of ChatGPT to generate unconditioned claims about the war in Ukraine, an event beyond its knowledge cutoff," and "We demonstrate that ChatGPT can produce realistic, target-specific disinformation cheaply, fast, and at scale, and that these claims cannot be reliably distinguished by humans or existing automated tools."

Progress saved at index: 31


Processing papers:   6%|███▉                                                          | 32/511 [01:26<22:55,  2.87s/it]

Title: Large Language Models as Minecraft Agents
Evaluation Result: Title: Large Language Models as Minecraft Agents
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "examining the challenges and opportunities for improvement."

Progress saved at index: 32


Processing papers:   6%|████                                                          | 33/511 [01:29<21:47,  2.74s/it]

Title: Punctuation Restoration Improves Structure Understanding without Supervision
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "despite impressive generative capabilities of recent large language models, their abilities to capture syntactic or semantic structure within text lag behind."

Progress saved at index: 33


Processing papers:   7%|████▏                                                         | 34/511 [01:31<20:58,  2.64s/it]

Title: Unsupervised Evaluation of Code LLMs with Round-Trip Correctness
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "To evaluate code large language models (LLMs), research has relied on a few small manually curated benchmarks, such as HumanEval and MBPP, which represent a narrow part of the real-world software domains."

Progress saved at index: 34


Processing papers:   7%|████▏                                                         | 35/511 [01:34<20:59,  2.65s/it]

Title: Evaluating the Data Model Robustness of Text-to-SQL Systems Based on Real User Queries
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "a systematic exploration of their robustness towards different data models in a real-world, realistic scenario is notably missing" and "we explore the performance of representative Text-to-SQL systems and language models. We further quantify the impact of training data size, pre-, and post-processing steps as well as language model inference time."

Progress saved at index: 35


Processing papers:   7%|████▎                                                         | 36/511 [01:37<22:03,  2.79s/it]

Title: Visually Dehallucinative Instruction Generation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "challenges persist in the hallucination of generative language models, i.e., the generated image-text data contains unintended contents."

Progress saved at index: 36


Processing papers:   7%|████▍                                                         | 37/511 [01:40<22:02,  2.79s/it]

Title: Eliciting Personality Traits in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, with this comes numerous ethical concerns, particularly related to the lack of transparency in these 'black-box' models."

Progress saved at index: 37


Processing papers:   7%|████▌                                                         | 38/511 [01:42<21:46,  2.76s/it]

Title: Prompted Contextual Vectors for Spear-Phishing Detection
Evaluation Result: Title: Prompted Contextual Vectors for Spear-Phishing Detection

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Spear-phishing attacks present a significant security challenge, with large language models (LLMs) escalating the threat by generating convincing emails and facilitating target reconnaissance."

Progress saved at index: 38


Processing papers:   8%|████▋                                                         | 39/511 [01:45<21:16,  2.70s/it]

Title: ChatCell: Facilitating Single-Cell Analysis with Natural Language
Evaluation Result: Title: ChatCell: Facilitating Single-Cell Analysis with Natural Language
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "High knowledge barriers and limited scalability in current methods restrict the full exploitation of LLMs in mastering single-cell data, impeding direct accessibility and rapid iteration."

Progress saved at index: 39


Processing papers:   8%|████▊                                                         | 40/511 [01:48<22:04,  2.81s/it]

Title: Towards Faithful and Robust LLM Specialists for Evidence-Based Question-Answering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Evidence-Based QA has proven to work insufficiently with LLMs in terms of citing the correct sources (source quality) and truthfully representing the information within sources (answer attributability)."

Progress saved at index: 40


Processing papers:   8%|████▉                                                         | 41/511 [01:50<21:17,  2.72s/it]

Title: A Survey of Table Reasoning with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Due to the existing lack of research, questions about which techniques can improve table reasoning performance in the era of LLMs, why LLMs excel at table reasoning, and how to enhance table reasoning abilities in the future, remain largely unexplored. This gap significantly limits progress in research."

Progress saved at index: 41


Processing papers:   8%|█████                                                         | 42/511 [01:53<21:04,  2.70s/it]

Title: BERT4FCA: A Method for Bipartite Link Prediction using Formal Concept Analysis and BERT
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To address this limitation, we propose an approach using BERT, which can learn more information from the maximal bi-cliques extracted by FCA and use them to make link prediction."

Progress saved at index: 42


Processing papers:   8%|█████▏                                                        | 43/511 [01:55<20:08,  2.58s/it]

Title: Privacy-Preserving Language Model Inference with Instance Obfuscation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Recent studies have started tackling the privacy issue by transforming input data into privacy-preserving representation from the user-end with the techniques such as noise addition and content perturbation, while the exploration of inference result protection, namely decision privacy, is still a blank page."

Progress saved at index: 43


Processing papers:   9%|█████▎                                                        | 44/511 [01:58<19:49,  2.55s/it]

Title: Improving Black-box Robustness with In-Context Rewriting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We propose LLM-TTA, which uses LLM-generated augmentations as TTA's augmentation function."

Progress saved at index: 44


Processing papers:   9%|█████▍                                                        | 45/511 [02:00<18:46,  2.42s/it]

Title: BBox-Adapter: Lightweight Adapting for Black-Box Large Language Models
Evaluation Result: Title: BBox-Adapter: Lightweight Adapting for Black-Box Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Due to the opacity in their parameters, embeddings, and even output probabilities, existing fine-tuning adaptation methods are inapplicable. Consequently, adapting these black-box LLMs is only possible through their API services, raising concerns about transparency, privacy, and cost."

Progress saved at index: 45


Processing papers:   9%|█████▌                                                        | 46/511 [02:03<20:17,  2.62s/it]

Title: LLaGA: Large Language and Graph Assistant
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, their application to graph data poses distinct challenges due to the inherent difficulty of translating graph structures to language."

Progress saved at index: 46


Processing papers:   9%|█████▋                                                        | 47/511 [02:06<20:00,  2.59s/it]

Title: On Limitations of the Transformer Architecture
Evaluation Result: **Title: On Limitations of the Transformer Architecture**
**Paper:** What are the root causes of hallucinations in large language models (LLMs)? We use Communication Complexity to prove that the Transformer layer is incapable of composing functions (e.g., identify a grandparent of a person in a genealogy) if the domains of the functions are large enough; we show through examples that this inability is already empirically present when the domains are quite small. We also point out that several mathematical tasks that are at

Progress saved at index: 47


Processing papers:   9%|█████▊                                                        | 48/511 [02:09<21:13,  2.75s/it]

Title: Verified Multi-Step Synthesis using Large Language Models and Monte Carlo Tree Search
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We present an approach using Monte Carlo Tree Search (MCTS) to guide Large Language Models (LLMs) to generate verified programs in Dafny, Lean and Coq."

Progress saved at index: 48


Processing papers:  10%|█████▉                                                        | 49/511 [02:11<20:38,  2.68s/it]

Title: On the Resurgence of Recurrent Models for Long Sequences -- Survey and Research Opportunities in the Transformer Era
Evaluation Result: Title: On the Resurgence of Recurrent Models for Long Sequences -- Survey and Research Opportunities in the Transformer Era
Paper: A longstanding challenge for the Machine Learning community is the one of developing models that are capable of processing and learning from very long sequences of data. The outstanding results of Transformers-based networks (e.g., Large Language Models) promotes the idea of parallel attention as the key to succeed in such a challenge, obfuscating the role of classic sequential processing of Recurrent Models. However

Progress saved at index: 49


Processing papers:  10%|██████                                                        | 50/511 [02:15<22:37,  2.95s/it]

Title: On the Self-Verification Limitations of Large Language Models on Reasoning and Planning Tasks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "While the initial optimism that reasoning might emerge automatically with scale has been tempered thanks to a slew of counterexamples--ranging from multiplication to simple planning--there persists a wide spread belief that LLMs can self-critique and improve their own solutions in an iterative fashion." and "We observe significant performance collapse with self-critique, significant performance gains with sound external verification,

Progress saved at index: 50


Processing papers:  10%|██████▏                                                       | 51/511 [02:18<23:37,  3.08s/it]

Title: Addressing cognitive bias in medical language models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Our analysis revealed varying effects for biases on these LLMs, with GPT-4 standing out for its resilience to bias, in contrast to Llama 2 70B-chat and PMC Llama 13B, which were disproportionately affected by cognitive bias."

Progress saved at index: 51


Processing papers:  10%|██████▎                                                       | 52/511 [02:21<22:18,  2.92s/it]

Title: Relative Preference Optimization: Enhancing LLM Alignment through Contrasting Responses across Identical and Diverse Prompts
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, DPO does not fully reflect the complex nature of human learning, which often involves understanding contrasting responses to not only identical but also similar questions."

Progress saved at index: 52


Processing papers:  10%|██████▍                                                       | 53/511 [02:23<20:32,  2.69s/it]

Title: Investigating the Impact of Data Contamination of Large Language Models in Text-to-SQL Translation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, there is a severe possibility that this translation ability may be influenced by having seen target textual descriptions and the related code. This effect is known as Data Contamination." and "Our results indicate a significant performance drop in GPT-3.5 on the unfamiliar Termite dataset, even with ATD modifications, highlighting the effect of Data Contamination on LLMs in Text-to

Progress saved at index: 53


Processing papers:  11%|██████▌                                                       | 54/511 [02:28<25:50,  3.39s/it]

Title: Grounding Data Science Code Generation with Input-Output Specifications
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, in the real world, NL is often too ambiguous to capture the true intent behind programming problems, requiring additional input-output (I/O) specifications. Unfortunately, LLMs can have difficulty aligning their outputs with both the NL prompt and the I/O specification."

Progress saved at index: 54


Processing papers:  11%|██████▋                                                       | 55/511 [02:31<24:07,  3.17s/it]

Title: Beyond LLMs: Advancing the Landscape of Complex Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, in addition to the many deficiencies of LLMs that prevent them from broad industry adoption, such as reliability, cost, and speed, there is a whole class of common real world problems that Large Language Models perform poorly on, namely, constraint satisfaction and optimization problems."

Progress saved at index: 55


Processing papers:  11%|██████▊                                                       | 56/511 [02:36<28:58,  3.82s/it]

Title: Why and When LLM-Based Assistants Can Go Wrong: Investigating the Effectiveness of Prompt-Based Interactions for Software Help-Seeking
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Most users struggled to understand how the prompt's text related to the LLM's responses and often followed the LLM's suggestions verbatim, even if they were incorrect. This resulted in difficulties when using the LLM's advice for software tasks, leading to low task completion rates. Our detailed analysis also revealed that users remained unaware of inaccuracies in the LLM's responses, indicating a

Progress saved at index: 56


Processing papers:  11%|██████▉                                                       | 57/511 [02:39<26:42,  3.53s/it]

Title: Lumos : Empowering Multimodal LLMs with Scene Text Recognition
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "While building Lumos, we encountered numerous challenges related to STR quality, overall latency, and model inference."

Progress saved at index: 57


Processing papers:  11%|███████                                                       | 58/511 [02:41<24:09,  3.20s/it]

Title: Refined Direct Preference Optimization with Synthetic Data for Behavioral Alignment of LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this paper, we introduce refined Direct Preference Optimization (rDPO), a method for improving the behavioral alignment of Large Language Models (LLMs) without the need for human-annotated data."

Progress saved at index: 58


Processing papers:  12%|███████▏                                                      | 59/511 [02:44<22:44,  3.02s/it]

Title: Suppressing Pink Elephants with Direct Principle Feedback
Evaluation Result: Title: Suppressing Pink Elephants with Direct Principle Feedback
Paper: Existing methods for controlling language models, such as RLHF and Constitutional AI, involve determining which LLM behaviors are desirable and training them into a language model. However, in many cases, it is desirable for LLMs to be controllable at inference time, so that they can be used in multiple contexts with diverse needs. We illustrate this with the Pink Elephant Problem: instructing an LLM to avoid discussing a certain entity (

Progress saved at index: 59


Processing papers:  12%|███████▎                                                      | 60/511 [02:47<22:34,  3.00s/it]

Title: WildfireGPT: Tailored Large Language Model for Wildfire Analysis
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLMs are generalized models, trained on extensive text corpus, and often struggle to provide context-specific information, particularly in areas requiring specialized knowledge such as wildfire details within the broader context of climate change."

Progress saved at index: 60


Processing papers:  12%|███████▍                                                      | 61/511 [02:49<21:12,  2.83s/it]

Title: Policy Improvement using Language Feedback Models
Evaluation Result: Title: Policy Improvement using Language Feedback Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To train LFMs, we obtain feedback from Large Language Models (LLMs) on visual trajectories verbalized to language descriptions."

Progress saved at index: 61


Processing papers:  12%|███████▌                                                      | 62/511 [02:52<20:40,  2.76s/it]

Title: PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models
Evaluation Result: Title: PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Despite their success, they also have inherent limitations such as a lack of up-to-date knowledge and hallucination."

Progress saved at index: 62


Processing papers:  12%|███████▋                                                      | 63/511 [02:54<20:07,  2.69s/it]

Title: AI-Augmented Predictions: LLM Assistants Improve Human Forecasting Accuracy
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This study explores the potential of LLMs to augment judgement in forecasting tasks."

Progress saved at index: 63


Processing papers:  13%|███████▊                                                      | 64/511 [02:56<18:48,  2.53s/it]

Title: Lissard: Long and Simple Sequential Reasoning Datasets
Evaluation Result: Title: Lissard: Long and Simple Sequential Reasoning Datasets  
Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 5.  
Evidence: "Language models are now capable of solving tasks that require dealing with long sequences consisting of hundreds of thousands of tokens. However, they often fail on tasks that require repetitive use of simple rules, even on sequences that are much shorter than those seen during training."

Progress saved at index: 64


Processing papers:  13%|███████▉                                                      | 65/511 [03:02<24:36,  3.31s/it]

Title: Mercury: An Efficiency Benchmark for LLM Code Synthesis
Evaluation Result: Title: Mercury: An Efficiency Benchmark for LLM Code Synthesis
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Our findings reveal that while LLMs demonstrate the remarkable capability to generate functionally correct code, there still exists a substantial gap in their efficiency output, underscoring a new frontier for LLM research and development."

Progress saved at index: 65


Processing papers:  13%|████████                                                      | 66/511 [03:04<23:23,  3.15s/it]

Title: Do Membership Inference Attacks Work on Large Language Models?
Evaluation Result: Title: Do Membership Inference Attacks Work on Large Language Models?

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We find that MIAs barely outperform random guessing for most settings across varying LLM sizes and domains. Our further analyses reveal that this poor performance can be attributed to (1) the combination of a large dataset and few training iterations, and (2) an inherently fuzzy boundary between members and non-members."

Progress saved at index: 66


Processing papers:  13%|████████▏                                                     | 67/511 [03:08<23:56,  3.24s/it]

Title: Differentially Private Zeroth-Order Methods for Scalable Large Language Model Finetuning
Evaluation Result: Title: Differentially Private Zeroth-Order Methods for Scalable Large Language Model Finetuning

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "DP-SGD-based finetuning methods are unfortunately limited by the inherent inefficiency of SGD."

Progress saved at index: 67


Processing papers:  13%|████████▎                                                     | 68/511 [03:10<22:15,  3.02s/it]

Title: Retrieval-Augmented Thought Process as Sequential Decision Making
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, several open challenges hinder their wider application: such as concerns over privacy, tendencies to produce hallucinations, and difficulties in handling long contexts."

Progress saved at index: 68


Processing papers:  14%|████████▎                                                     | 69/511 [03:12<20:24,  2.77s/it]

Title: TELLER: A Trustworthy Framework for Explainable, Generalizable and Controllable Fake News Detection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "their reliability may be compromised caused by the non-transparent reasoning processes, poor generalization abilities and inherent risks of integration with large language models (LLMs)."

Progress saved at index: 69


Processing papers:  14%|████████▍                                                     | 70/511 [03:15<19:12,  2.61s/it]

Title: Quantitative knowledge retrieval from large language models
Evaluation Result: Title: Quantitative knowledge retrieval from large language models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Implications and challenges of using LLMs as 'experts' are discussed."

Progress saved at index: 70


Processing papers:  14%|████████▌                                                     | 71/511 [03:17<18:02,  2.46s/it]

Title: AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "By revealing the limitations of existing LALMs through evaluation results, AIR-Bench can provide insights into the direction of future research."

Progress saved at index: 71


Processing papers:  14%|████████▋                                                     | 72/511 [03:19<18:04,  2.47s/it]

Title: CyberMetric: A Benchmark Dataset for Evaluating Large Language Models Knowledge in Cybersecurity
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Large Language Models (LLMs) excel across various domains, from computer vision to medical diagnostics."

Progress saved at index: 72


Processing papers:  14%|████████▊                                                     | 73/511 [03:22<17:37,  2.41s/it]

Title: Large Language Models "Ad Referendum": How Good Are They at Machine Translation in the Legal Domain?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "The results indicate that while Google Translate generally outperforms LLMs in AEMs, human evaluators rate LLMs, especially GPT-4, comparably or slightly better in terms of producing contextually adequate and fluent translations."

Progress saved at index: 73


Processing papers:  14%|████████▉                                                     | 74/511 [03:24<18:07,  2.49s/it]

Title: The Sound of Healthcare: Improving Medical Transcription ASR Accuracy with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This study explores the potential of Large Language Models (LLMs) to enhance the accuracy of Automatic Speech Recognition (ASR) systems in medical transcription."

Progress saved at index: 74


Processing papers:  15%|█████████                                                     | 75/511 [03:27<18:54,  2.60s/it]

Title: Detecting the Clinical Features of Difficult-to-Treat Depression using Synthetic Data from Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We sought to develop a Large Language Model (LLM)-based tool capable of interrogating routinely-collected, narrative (free-text) electronic health record (EHR) data to locate published prognostic factors that capture the clinical syndrome of DTD."

Progress saved at index: 75


Processing papers:  15%|█████████▏                                                    | 76/511 [03:30<18:32,  2.56s/it]

Title: G-Retriever: Retrieval-Augmented Generation for Textual Graph Understanding and Question Answering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "To resist hallucination and to allow for textual graphs that greatly exceed the LLM's context window size, G-Retriever performs RAG over a graph by formulating this task as a Prize-Collecting Steiner Tree optimization problem."

Progress saved at index: 76


Processing papers:  15%|█████████▎                                                    | 77/511 [03:32<18:08,  2.51s/it]

Title: Anchor-based Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the substantial size and parameter volume of these LLMs require massive GPU memory. This memory demand increases with the length of the input text, leading to an urgent need for more efficient methods of information storage and processing."

Progress saved at index: 77


Processing papers:  15%|█████████▍                                                    | 78/511 [03:34<17:54,  2.48s/it]

Title: Step-On-Feet Tuning: Scaling Self-Alignment of LLMs via Bootstrapping
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "our pioneering exploration delves into the impact of bootstrapping self-alignment on large language models."

Progress saved at index: 78


Processing papers:  15%|█████████▌                                                    | 79/511 [03:36<17:00,  2.36s/it]

Title: BreakGPT: A Large Language Model with Multi-stage Structure for Financial Breakout Detection
Evaluation Result: Title: BreakGPT: A Large Language Model with Multi-stage Structure for Financial Breakout Detection
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Recently, large language models have achieved success in various downstream applications, but their effectiveness in the domain of financial breakout detection has been subpar. The reason is that the unique data and specific knowledge are required in breakout detection."

Progress saved at index: 79


Processing papers:  16%|█████████▋                                                    | 80/511 [03:39<17:47,  2.48s/it]

Title: Secret Collusion Among Generative AI Agents
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "While the steganographic capabilities of current models remain limited, GPT-4 displays a capability jump suggesting the need for continuous monitoring of steganographic frontier model capabilities."

Progress saved at index: 80


Processing papers:  16%|█████████▊                                                    | 81/511 [03:42<17:35,  2.45s/it]

Title: T-RAG: Lessons from the LLM Trenches
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "making it robust and a reliable application requires extensive customization and relatively deep knowledge of the application domain."

Progress saved at index: 81


Processing papers:  16%|█████████▉                                                    | 82/511 [03:44<16:40,  2.33s/it]

Title: Food Recommendation as Language Processing (F-RLP): A Personalized and Contextual Paradigm
Evaluation Result: Title: Food Recommendation as Language Processing (F-RLP): A Personalized and Contextual Paradigm
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, a general-purpose Recommendation as Language Processing (RLP) approach lacks the critical components necessary for effective food recommendations."

Progress saved at index: 82


Processing papers:  16%|██████████                                                    | 83/511 [03:46<16:46,  2.35s/it]

Title: Pushing The Limit of LLM Capacity for Text Classification
Evaluation Result: Title: Pushing The Limit of LLM Capacity for Text Classification
Paper: The value of text classification's future research has encountered challenges and uncertainties, due to the extraordinary efficacy demonstrated by large language models (LLMs) across numerous downstream NLP tasks. In this era of open-ended language modeling, where task boundaries are gradually fading, an urgent question emerges: have we made significant advances in text classification under the full benefit of LLMs? To answer this question, we propose RGPT, an adaptive

Progress saved at index: 83


Processing papers:  16%|██████████▏                                                   | 84/511 [03:51<23:16,  3.27s/it]

Title: Game Agent Driven by Free-Form Text Command: Using LLM-based Code Generation and Behavior Branch
Evaluation Result: Title: Game Agent Driven by Free-Form Text Command: Using LLM-based Code Generation and Behavior Branch

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The proposed system uses a large language model (LLM) for code generation to interpret and transform natural language commands into behavior branch, a proposed knowledge expression based on behavior trees, which facilitates execution by the game agent."

Progress saved at index: 84


Processing papers:  17%|██████████▎                                                   | 85/511 [03:54<21:38,  3.05s/it]

Title: Benchmarking and Building Long-Context Retrieval Models with LoCo and M2-BERT
Evaluation Result: Title: Benchmarking and Building Long-Context Retrieval Models with LoCo and M2-BERT
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Developing long-context retrieval encoders suitable for these domains raises three challenges: (1) how to evaluate long-context retrieval performance, (2) how to pretrain a base language model to represent both short contexts (corresponding to queries) and long contexts (corresponding

Progress saved at index: 85


Processing papers:  17%|██████████▍                                                   | 86/511 [03:57<21:17,  3.00s/it]

Title: Large Language Models are Few-shot Generators: Proposing Hybrid Prompt Algorithm To Generate Webshell Escape Samples
Evaluation Result: Title: Large Language Models are Few-shot Generators: Proposing Hybrid Prompt Algorithm To Generate Webshell Escape Samples

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we propose the Hybrid Prompt algorithm for webshell escape sample generation with the help of large language models."

Progress saved at index: 86


Processing papers:  17%|██████████▌                                                   | 87/511 [03:59<20:07,  2.85s/it]

Title: Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English
Evaluation Result: Title: Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English
Paper: Despite Spanish's pivotal role in the global finance industry, a pronounced gap exists in Spanish financial natural language processing (NLP) and application studies compared to English, especially in the era of large language models (LLMs). To bridge this gap, we unveil Tois\'on de Oro, the first bilingual framework that establishes instruction datasets, finetuned

Progress saved at index: 87


Processing papers:  17%|██████████▋                                                   | 88/511 [04:02<19:41,  2.79s/it]

Title: Can LLMs Produce Faithful Explanations For Fact-checking? Towards Faithful Explainable Fact-Checking via Multi-Agent Debate
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "While Large Language Models (LLMs) excel in text generation, their capability for producing faithful explanations in fact-checking remains underexamined. Our study investigates LLMs' ability to generate such explanations, finding that zero-shot prompts often result in unfaithfulness."

Progress saved at index: 88


Processing papers:  17%|██████████▊                                                   | 89/511 [04:05<19:11,  2.73s/it]

Title: Exploring Perceptual Limitation of Multimodal Large Language Models
Evaluation Result: Title: Exploring Perceptual Limitation of Multimodal Large Language Models
Paper: Multimodal Large Language Models (MLLMs) have recently shown remarkable perceptual capability in answering visual questions, however, little is known about the limits of their perception. In particular, while prior works have provided anecdotal evidence of MLLMs' sensitivity to object size, this phenomenon and its underlying causes have not been explored comprehensively. In this work, we quantitatively study the perception of small visual

Progress saved at index: 89


Processing papers:  18%|██████████▉                                                   | 90/511 [04:07<18:38,  2.66s/it]

Title: Assessing Generalization for Subpopulation Representative Modeling via In-Context Learning
Evaluation Result: Title: Assessing Generalization for Subpopulation Representative Modeling via In-Context Learning

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the benefit of in-context learning varies considerably across demographics, sometimes hurting performance for one demographic while helping performance for others" and "The inequitable benefits of in-context learning for SRM present a challenge for practitioners implementing SRMs."

Progress saved at index: 90


Processing papers:  18%|███████████                                                   | 91/511 [04:10<18:30,  2.64s/it]

Title: Differentially Private Training of Mixture of Experts Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, this growth raises significant computational and privacy concerns."

Progress saved at index: 91


Processing papers:  18%|███████████▏                                                  | 92/511 [04:12<16:59,  2.43s/it]

Title: Summing Up the Facts: Additive Mechanisms Behind Factual Recall in LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We find that the mechanistic story behind factual recall is more complex than previously thought."

Progress saved at index: 92


Processing papers:  18%|███████████▎                                                  | 93/511 [04:14<16:21,  2.35s/it]

Title: ODIN: Disentangled Reward Mitigates Hacking in RLHF
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "A well-formatted, verbose but less helpful response from the LLMs can often deceive LLMs or even human evaluators to achieve high scores."

Progress saved at index: 93


Processing papers:  18%|███████████▍                                                  | 94/511 [04:16<16:40,  2.40s/it]

Title: A Theoretical Analysis of Nash Learning from Human Feedback under General KL-Regularized Preference
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the reward-based RLHF is limited in expressivity and cannot capture the real-world complicated human preference."

Progress saved at index: 94


Processing papers:  19%|███████████▌                                                  | 95/511 [04:18<16:04,  2.32s/it]

Title: How do Large Language Models Navigate Conflicts between Honesty and Helpfulness?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We find that reinforcement learning from human feedback improves both honesty and helpfulness, while chain-of-thought prompting skews LLMs towards helpfulness over honesty."

Progress saved at index: 95


Processing papers:  19%|███████████▋                                                  | 96/511 [04:21<16:03,  2.32s/it]

Title: TransGPT: Multi-modal Generative Pre-trained Transformer for Transportation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This paper presents TransGPT, a novel (multi-modal) large language model for the transportation domain, which consists of two independent variants: TransGPT-SM for single-modal data and TransGPT-MM for multi-modal data."

Progress saved at index: 96


Processing papers:  19%|███████████▊                                                  | 97/511 [04:24<16:53,  2.45s/it]

Title: Beware of Words: Evaluating the Lexical Richness of Conversational Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "This means that for example, if conversational LLMs do not use a word it may become less and less frequent and eventually stop being used altogether."

Progress saved at index: 97


Processing papers:  19%|███████████▉                                                  | 98/511 [04:26<16:32,  2.40s/it]

Title: Synergizing Spatial Optimization with Large Language Models for Open-Domain Urban Itinerary Planning
Evaluation Result: Title: Synergizing Spatial Optimization with Large Language Models for Open-Domain Urban Itinerary Planning
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, due to non-real-time information, incomplete knowledge, and insufficient spatial awareness, they are unable to independently deliver a satisfactory user experience in OUIP."

Progress saved at index: 98


Processing papers:  19%|████████████                                                  | 99/511 [04:29<18:21,  2.67s/it]

Title: GraphTranslator: Aligning Graph Model to Large Language Model for Open-ended Tasks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Although several methods applying LLMs to graphs have been proposed, they fail to simultaneously handle the pre-defined and open-ended tasks, with LLM as a node feature enhancer or as a standalone predictor."

Progress saved at index: 99


Processing papers:  20%|███████████▉                                                 | 100/511 [04:32<18:00,  2.63s/it]

Title: Prompt Perturbation in Retrieval-Augmented Generation based Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We find that the insertion of even a short prefix to the prompt leads to the generation of outputs far away from factually correct answers."

Progress saved at index: 100


Processing papers:  20%|████████████                                                 | 101/511 [04:35<18:44,  2.74s/it]

Title: Large-Language-Model Empowered Dose Volume Histogram Prediction for Intensity Modulated Radiotherapy
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "facilitated by a large-language model (LLM) to enhance the planning quality" and "the LLM-empowered DoseGNN model facilitates seamless adjustment to treatment plans through interaction with clinicians using natural language."

Progress saved at index: 101


Processing papers:  20%|████████████▏                                                | 102/511 [04:37<18:12,  2.67s/it]

Title: Social Evolution of Published Text and The Emergence of Artificial Intelligence Through Large Language Models and The Problem of Toxicity and Bias
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.

Progress saved at index: 102


Processing papers:  20%|████████████▎                                                | 103/511 [04:40<18:13,  2.68s/it]

Title: Effort and Size Estimation in Software Projects with Large Language Model-based Intelligent Interfaces
Evaluation Result: Title: Effort and Size Estimation in Software Projects with Large Language Model-based Intelligent Interfaces

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "inclusion of LLM-based AI agents in software design often poses unexpected challenges, especially in the estimation of development efforts."

Progress saved at index: 103


Processing papers:  20%|████████████▍                                                | 104/511 [04:42<17:45,  2.62s/it]

Title: Natural Language Reinforcement Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We present how NLRL can be practically implemented with the latest advancements in large language models (LLMs) like GPT-4."

Progress saved at index: 104


Processing papers:  21%|████████████▌                                                | 105/511 [04:45<18:16,  2.70s/it]

Title: Graph Descriptive Order Improves Reasoning with Large Language Model
Evaluation Result: **Title: Graph Descriptive Order Improves Reasoning with Large Language Model**

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the progress in the field of graph reasoning with LLM remains limited." and "We discover that the graph reasoning performance of LLMs does not monotonically decrease with the increase in graph size."

Progress saved at index: 105


Processing papers:  21%|████████████▋                                                | 106/511 [04:49<20:58,  3.11s/it]

Title: Using Large Language Models for Student-Code Guided Test Case Generation in Computer Science Education
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this work, we propose a large language model-based approach to automatically generate test cases and show that they are good measures of student knowledge."

Progress saved at index: 106


Processing papers:  21%|████████████▊                                                | 107/511 [04:53<21:48,  3.24s/it]

Title: Exploring the Impact of Large Language Models on Recommender Systems: An Extensive Review
Evaluation Result: Title: Exploring the Impact of Large Language Models on Recommender Systems: An Extensive Review

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Despite their transformative potential, challenges persist, including sensitivity to input prompts, occasional misinterpretations, and unforeseen recommendations, necessitating continuous refinement and evolution in LLM-driven recommender systems."

Progress saved at index: 107


Processing papers:  21%|████████████▉                                                | 108/511 [04:56<22:30,  3.35s/it]

Title: A Tale of Tails: Model Collapse as a Change of Scaling Laws
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We discover a wide range of decay phenomena, analyzing loss of scaling, shifted scaling with number of generations, the ''un-learning" of skills, and grokking when mixing human and synthesized data."

Progress saved at index: 108


Processing papers:  21%|█████████████                                                | 109/511 [05:06<34:21,  5.13s/it]

Title: Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models
Evaluation Result: Title: Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models  
Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 2.  
Evidence: "running them on resource-constrained settings, where GPU memory resources are not abundant, is challenging due to huge model sizes."

Progress saved at index: 109


Processing papers:  22%|█████████████▎                                               | 111/511 [05:24<51:00,  7.65s/it]

Error during API call: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)
Skipping paper at index 110 due to API call failure.
Title: REALM: RAG-Driven Enhancement of Multimodal Electronic Health Records Analysis via Large Language Models
Evaluation Result: Title: REALM: RAG-Driven Enhancement of Multimodal Electronic Health Records Analysis via Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "address these limitations" and "eliminates hallucinations and ensures consistency."

Progress saved at index: 111


Processing papers:  22%|█████████████▎                                               | 112/511 [05:32<51:21,  7.72s/it]

Title: DAEDRA: A language model for predicting outcomes in passive pharmacovigilance reporting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Generic language models may not capture the complex clinical dimensions while specific clinical or biomedical models may not perform well on lay reports."

Progress saved at index: 112


Processing papers:  22%|█████████████▍                                               | 113/511 [05:41<54:11,  8.17s/it]

Title: A Thorough Examination of Decoding Methods in the Era of LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our findings reveal that decoding method performance is notably task-dependent and influenced by factors such as alignment, model size, and quantization. Intriguingly, sensitivity analysis exposes that certain methods achieve superior performance at the cost of extensive hyperparameter tuning, highlighting the trade-off between attaining optimal results and the practicality of implementation in varying contexts."

Progress saved at index: 113


Processing papers:  23%|█████████████▎                                             | 115/511 [06:03<1:08:57, 10.45s/it]

Error during API call: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Skipping paper at index 114 due to API call failure.
Title: Generating Chain-of-Thoughts with a Direct Pairwise-Comparison Approach to Searching for the Most Promising Intermediate Thought
Evaluation Result: **Title: Generating Chain-of-Thoughts with a Direct Pairwise-Comparison Approach to Searching for the Most Promising Intermediate Thought**

- Does it talk about LLMs: Yes.
- Rate Limitations of LLMs: 4.
- Evidence: "a widespread yet understudied problem is that the evaluation from the LLM is typically noisy and unreliable, potentially misleading the generation process in selecting promising intermediate thoughts."

Progress saved at index: 115


Processing papers:  23%|█████████████▍                                             | 116/511 [06:23<1:26:30, 13.14s/it]

Title: Can LLMs Recognize Toxicity? Structured Toxicity Investigation Framework and Semantic-Based Metric
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "The majority of existing toxicity metrics rely on encoder models trained on specific toxicity datasets. However, these encoders are susceptible to out-of-distribution (OOD) problems and depend on the definition of toxicity assumed in a dataset."

Progress saved at index: 116


Processing papers:  23%|█████████████▌                                             | 117/511 [06:33<1:20:24, 12.25s/it]

Title: UrbanKGent: A Unified Large Language Model Agent Framework for Urban Knowledge Graph Construction
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "UrbanKGent-13B not only can significantly outperform 21 baselines in UrbanKGC tasks, but also surpass the state-of-the-art LLM, GPT-4, by more than 10% with approximately 20 times lower cost."

Progress saved at index: 117


Processing papers:  23%|█████████████▌                                             | 118/511 [06:37<1:03:30,  9.70s/it]

Title: History, Development, and Principles of Large Language Models-An Introductory Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The survey also highlights the limitations of existing work and points out promising future directions."

Progress saved at index: 118


Processing papers:  23%|██████████████▏                                              | 119/511 [06:39<49:22,  7.56s/it]

Title: ChemLLM: A Chemical Large Language Model
Evaluation Result: Title: ChemLLM: A Chemical Large Language Model

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the direct use of these structured data compromises the model's ability to maintain coherent dialogue."

Progress saved at index: 119


Processing papers:  23%|██████████████▎                                              | 120/511 [06:42<38:57,  5.98s/it]

Title: Forecasting Events in Soccer Matches Through Language
Evaluation Result: Title: Forecasting Events in Soccer Matches Through Language

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "a challenge bearing remarkable similarities to the problem faced by Large Language Models (LLMs)."

Progress saved at index: 120


Processing papers:  24%|██████████████▍                                              | 121/511 [06:44<32:02,  4.93s/it]

Title: The Unreasonable Effectiveness of Eccentric Automatic Prompts
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large Language Models (LLMs) have demonstrated remarkable problem-solving and basic mathematics abilities. However, their efficacy is highly contingent on the formulation of the prompt."

Progress saved at index: 121


Processing papers:  24%|██████████████▌                                              | 122/511 [06:46<26:56,  4.16s/it]

Title: Estimating Player Performance in Different Contexts Using Fine-tuned Large Events Models
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses Large Event Models (LEMs) in the context of soccer analytics and does not mention language models (LLMs).

Progress saved at index: 122


Processing papers:  24%|██████████████▋                                              | 123/511 [06:49<23:43,  3.67s/it]

Title: Is it safe to cross? Interpretable Risk Assessment with GPT-4V for Safety-Aware Street Crossing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This paper introduces an innovative approach that leverages large multimodal models (LMMs) to interpret complex street crossing scenes, offering a potential advancement over conventional traffic signal recognition techniques."

Progress saved at index: 123


Processing papers:  24%|██████████████▊                                              | 124/511 [06:52<21:44,  3.37s/it]

Title: Debating with More Persuasive LLMs Leads to More Truthful Answers
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Common methods for aligning large language models (LLMs) with desired behaviour heavily rely on human-labelled data."

Progress saved at index: 124


Processing papers:  24%|██████████████▉                                              | 125/511 [06:54<19:27,  3.03s/it]

Title: GLaM: Fine-Tuning Large Language Models for Domain Knowledge Graph Alignment via Neighborhood Partitioning and Generative Subgraph Encoding
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "their ability to reason over domain-specialized graphs of interconnected entities remains limited" and "The answer is no--such capabilities lie beyond current methods."

Progress saved at index: 125


Processing papers:  25%|███████████████                                              | 126/511 [06:56<18:04,  2.82s/it]

Title: EntGPT: Linking Generative Large Language Models with Knowledge Bases
Evaluation Result: Title: EntGPT: Linking Generative Large Language Models with Knowledge Bases

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The ability of Large Language Models (LLMs) to generate factually correct output remains relatively unexplored due to the lack of fact-checking and knowledge grounding during training and inference."

Progress saved at index: 126


Processing papers:  25%|███████████████▏                                             | 127/511 [06:59<17:44,  2.77s/it]

Title: NICE: To Optimize In-Context Examples or Not?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We challenge this consensus by investigating the necessity of optimizing ICE when task-specific instructions are provided and find that there are tasks for which it yields diminishing returns."

Progress saved at index: 127


Processing papers:  25%|███████████████▎                                             | 128/511 [07:02<17:38,  2.76s/it]

Title: Feedback Loops With Language Models Drive In-Context Reward Hacking
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "we show that feedback loops can cause in-context reward hacking (ICRH), where the LLM at test-time optimizes a (potentially implicit) objective but creates negative side effects in the process."

Progress saved at index: 128


Processing papers:  25%|███████████████▍                                             | 129/511 [07:06<20:12,  3.17s/it]

Title: Understanding the Effects of Iterative Prompting on Truthfulness
Evaluation Result: Title: Understanding the Effects of Iterative Prompting on Truthfulness

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Yet, the reliability and truthfulness of these models remain pressing concerns." and "naive prompting methods significantly undermine truthfulness, leading to exacerbated calibration errors."

Progress saved at index: 129


Processing papers:  25%|███████████████▌                                             | 130/511 [07:09<19:58,  3.15s/it]

Title: If Turing played piano with an artificial partner
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses neural network architectures and generative models for producing musical scores but does not specifically mention language models (LLMs or LMs).

Progress saved at index: 130


Processing papers:  26%|███████████████▋                                             | 131/511 [07:11<18:13,  2.88s/it]

Title: TIC: Translate-Infer-Compile for accurate 'text to plan' using LLMs and logical intermediate representations
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLMs excel at natural language processing but do not perform well on planning."

Progress saved at index: 131


Processing papers:  26%|███████████████▊                                             | 132/511 [07:13<16:43,  2.65s/it]

Title: On the Out-Of-Distribution Generalization of Multimodal Large Language Models
Evaluation Result: Title: On the Out-Of-Distribution Generalization of Multimodal Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Empirical results indicate that MLLMs struggle with generalization beyond common training domains, limiting their direct application without adaptation." and "We further explore the robustness of ICL under distribution shifts and show its vulnerability to domain shifts, label shifts, and spurious correlation shifts between in-context examples

Progress saved at index: 132


Processing papers:  26%|███████████████▉                                             | 133/511 [07:17<19:01,  3.02s/it]

Title: Understanding the Weakness of Large Language Model Agents within a Complex Android Environment
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "LLM agents face three primary challenges," "even state-of-the-art LLM agents struggle in cross-APP scenarios and adhering to specific constraints," and "a lack of four key capabilities, i.e., understanding, reasoning, exploration, and reflection, as primary reasons for the failure of LLM agents."

Progress saved at index: 133


Processing papers:  26%|███████████████▉                                             | 134/511 [07:20<18:38,  2.97s/it]

Title: G-SciEdBERT: A Contextualized LLM for Science Assessment Tasks in German
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Automatically scoring written responses to science questions in German is a complex task and challenging for standard G-BERT as they lack contextual knowledge in the science domain and may be unaligned with student writing styles."

Progress saved at index: 134


Processing papers:  26%|████████████████                                             | 135/511 [07:22<17:38,  2.82s/it]

Title: The Quantified Boolean Bayesian Network: Theory and Experiments with a Logical Graphical Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "The QBBN is meant to address a central problem with the Large Language Model (LLM), which has become extremely popular in Information Retrieval, which is that the LLM hallucinates."

Progress saved at index: 135


Processing papers:  27%|████████████████▏                                            | 136/511 [07:25<16:58,  2.72s/it]

Title: Calibrating Long-form Generations from Large Language Models
Evaluation Result: Title: Calibrating Long-form Generations from Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "larger models don't necessarily guarantee better calibration," and "calibration performance is found to be metric-dependent."

Progress saved at index: 136


Processing papers:  27%|████████████████▎                                            | 137/511 [07:27<16:42,  2.68s/it]

Title: Introspective Planning: Guiding Language-Enabled Agents to Refine Their Own Uncertainty
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, LLM hallucination may result in robots confidently executing plans that are misaligned with user goals or, in extreme cases, unsafe. Additionally, inherent ambiguity in natural language instructions can induce task uncertainty, particularly in situations where multiple valid options exist."

Progress saved at index: 137


Processing papers:  27%|████████████████▍                                            | 138/511 [07:31<18:21,  2.95s/it]

Title: Large Language Models for Captioning and Retrieving Remote Sensing Images
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "the development and application of vision and language models to the remote sensing domain have been hindered by the relatively small size of the available datasets and models used in previous studies."

Progress saved at index: 138


Processing papers:  27%|████████████████▌                                            | 139/511 [07:34<17:39,  2.85s/it]

Title: V-STaR: Training Verifiers for Self-Taught Reasoners
Evaluation Result: Title: V-STaR: Training Verifiers for Self-Taught Reasoners
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "These approaches discard the large amounts of incorrect solutions generated during this process, potentially neglecting valuable information in such solutions."

Progress saved at index: 139


Processing papers:  27%|████████████████▋                                            | 140/511 [07:37<18:09,  2.94s/it]

Title: CoSearchAgent: A Lightweight Collaborative Search Agent with Large Language Models
Evaluation Result: Title: CoSearchAgent: A Lightweight Collaborative Search Agent with Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In recent years, large language models (LLMs) have been demonstrated to interact naturally with users and achieve complex information-seeking tasks through LLM-based agents."

Progress saved at index: 140


Processing papers:  28%|████████████████▊                                            | 141/511 [07:39<17:30,  2.84s/it]

Title: RareBench: Can LLMs Serve as Rare Diseases Specialists?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Generalist Large Language Models (LLMs), such as GPT-4, have shown considerable promise in various domains, including medical diagnosis." and "Our experimental findings underscore the promising potential of integrating LLMs into the clinical diagnostic process for rare diseases."

Progress saved at index: 141


Processing papers:  28%|████████████████▉                                            | 142/511 [07:42<16:56,  2.75s/it]

Title: ExaRanker-Open: Synthetic Explanation for IR using Open-Source LLMs
Evaluation Result: Title: ExaRanker-Open: Synthetic Explanation for IR using Open-Source LLMs
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the initial results were based on proprietary language models such as GPT-3.5, which posed constraints on dataset size due to its cost and data privacy."

Progress saved at index: 142


Processing papers:  28%|█████████████████                                            | 143/511 [07:44<16:24,  2.68s/it]

Title: InternLM-Math: Open Math Large Language Models Toward Verifiable Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The math abilities of large language models can represent their abstract reasoning ability."

Progress saved at index: 143


Processing papers:  28%|█████████████████▏                                           | 144/511 [07:46<15:15,  2.49s/it]

Title: Zero-shot Explainable Mental Health Analysis on Social Media by Incorporating Mental Scales
Evaluation Result: Title: Zero-shot Explainable Mental Health Analysis on Social Media by Incorporating Mental Scales
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The generative approaches, such as those based on large language models (LLMs), have the potential to get rid of heavy annotations and provide explanations but their capabilities still fall short compared to discriminative approaches, and their explanations may be unreliable due to the fact that the generation of explanation is

Progress saved at index: 144


Processing papers:  28%|█████████████████▎                                           | 145/511 [07:49<15:51,  2.60s/it]

Title: LLaVA-Docent: Instruction Tuning with Multimodal Large Language Model to Support Art Appreciation Education
Evaluation Result: **Title:** LLaVA-Docent: Instruction Tuning with Multimodal Large Language Model to Support Art Appreciation Education

**Does it talk about LLMs:** Yes.

**Rate Limitations of LLMs:** 1.

**Evidence:** "The evaluation process revealed distinct strengths and weaknesses of the LLaVA-Docent model."

Progress saved at index: 145


Processing papers:  29%|█████████████████▍                                           | 146/511 [07:52<15:55,  2.62s/it]

Title: On the Efficacy of Eviction Policy for Key-Value Constrained Generative Language Model Inference
Evaluation Result: **Title: On the Efficacy of Eviction Policy for Key-Value Constrained Generative Language Model Inference**
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Despite the recent success associated with Large Language Models (LLMs), they are notably cost-prohibitive to deploy in resource-constrained environments due to their excessive memory and computational demands."

Progress saved at index: 146


Processing papers:  29%|█████████████████▌                                           | 147/511 [07:55<16:18,  2.69s/it]

Title: Studious Bob Fight Back Against Jailbreaking via Prompt Adversarial Tuning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "they are also susceptible to certain prompts that can induce them to bypass built-in safety measures and provide dangerous or illegal content, a phenomenon known as jailbreak."

Progress saved at index: 147


Processing papers:  29%|█████████████████▋                                           | 148/511 [07:57<15:43,  2.60s/it]

Title: Entropy-Regularized Token-Level Policy Optimization for Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Nonetheless, it faces significant hurdles: 1) instability stemming from the exponentially vast action space requiring exploration; 2) challenges in assigning token-level credit based on action-level reward signals, resulting in discord between maximizing rewards and accurately modeling corpus data."

Progress saved at index: 148


Processing papers:  29%|█████████████████▊                                           | 149/511 [08:00<15:25,  2.56s/it]

Title: Exploring Interaction Patterns for Debugging: Enhancing Conversational Capabilities of AI-assistants
Evaluation Result: Title: Exploring Interaction Patterns for Debugging: Enhancing Conversational Capabilities of AI-assistants

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLMs often leap to action without sufficient context, giving rise to implicit assumptions and inaccurate responses."

Progress saved at index: 149


Processing papers:  29%|█████████████████▉                                           | 150/511 [08:03<16:39,  2.77s/it]

Title: ResumeFlow: An LLM-facilitated Pipeline for Personalized Resume Generation and Refinement
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We propose ResumeFlow: a Large Language Model (LLM) aided tool that enables an end user to simply provide their detailed resume and the desired job posting, and obtain a personalized resume specifically tailored to that specific job posting in the matter of a few seconds."

Progress saved at index: 150


Processing papers:  30%|██████████████████                                           | 151/511 [08:05<16:10,  2.70s/it]

Title: The Generative AI Paradox on Evaluation: What It Can Solve, It May Not Evaluate
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Results indicate a significant disparity, with LLMs exhibiting lower performance in evaluation tasks compared to generation tasks." and "underscoring the need to examine the faithfulness and trustworthiness of LLMs as evaluators."

Progress saved at index: 151


Processing papers:  30%|██████████████████▏                                          | 152/511 [08:08<16:19,  2.73s/it]

Title: Large Language Models: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We review some of the most prominent LLMs, including three popular LLM families (GPT, LLaMA, PaLM), and discuss their characteristics, contributions and limitations."

Progress saved at index: 152


Processing papers:  30%|██████████████████▎                                          | 153/511 [08:11<16:02,  2.69s/it]

Title: CultureLLM: Incorporating Cultural Differences into Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large language models (LLMs) are reported to be partial to certain cultures owing to the training data dominance from the English corpora." and "Since multilingual cultural data are often expensive to collect, existing efforts handle this by prompt engineering or culture-specific pre-training. However, they might overlook the knowledge deficiency of low-resource culture and require extensive computing resources."

Progress saved at index: 153


Processing papers:  30%|██████████████████▍                                          | 154/511 [08:14<16:29,  2.77s/it]

Title: Learn To be Efficient: Build Structured Sparsity in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large Language Models (LLMs) have achieved remarkable success with their billion-level parameters, yet they incur high inference overheads."

Progress saved at index: 154


Processing papers:  30%|██████████████████▌                                          | 155/511 [08:16<15:22,  2.59s/it]

Title: Exploring Group and Symmetry Principles in Large Language Models
Evaluation Result: **Title:** Exploring Group and Symmetry Principles in Large Language Models

**Paper:** Large Language Models (LLMs) have demonstrated impressive performance across a wide range of applications; however, assessing their reasoning capabilities remains a significant challenge. In this paper, we introduce a framework grounded in group and symmetry principles, which have played a crucial role in fields such as physics and mathematics, and offer another way to evaluate their capabilities. While the proposed framework is general, to showcase the benefits of employing these properties,

Progress saved at index: 155


Processing papers:  31%|██████████████████▌                                          | 156/511 [08:19<15:37,  2.64s/it]

Title: ContPhy: Continuum Physical Concept Learning and Reasoning from Videos
Evaluation Result: Title: ContPhy: Continuum Physical Concept Learning and Reasoning from Videos
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We also introduce an oracle model (ContPRO) that marries the particle-based physical dynamic models with the recent large language models, which enjoy the advantages of both models, precise dynamic predictions, and interpretable reasoning."

Progress saved at index: 156


Processing papers:  31%|██████████████████▋                                          | 157/511 [08:21<15:34,  2.64s/it]

Title: ViGoR: Improving Visual Grounding of Large Vision Language Models with Fine-Grained Reward Modeling
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "the generated text often suffers from inaccurate grounding in the visual input, resulting in errors such as hallucinating nonexistent scene elements, missing significant parts of the scene, and inferring incorrect attributes and relationships between objects."

Progress saved at index: 157


Processing papers:  31%|██████████████████▊                                          | 158/511 [08:24<15:45,  2.68s/it]

Title: LLMs for Coding and Robotics Education
Evaluation Result: Title: LLMs for Coding and Robotics Education

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our results show that GPT-4V outperforms other models in all of our tests but struggles with generating block diagram images."

Progress saved at index: 158


Processing papers:  31%|██████████████████▉                                          | 159/511 [08:26<15:06,  2.57s/it]

Title: SubGen: Token Generation in Sublinear Time and Memory
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Despite the significant success of large language models (LLMs), their extensive memory requirements pose challenges for deploying them in long-context token generation."

Progress saved at index: 159


Processing papers:  31%|███████████████████                                          | 160/511 [08:28<14:08,  2.42s/it]

Title: Large Language Model Augmented Exercise Retrieval for Personalized Language Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "vector similarity approaches poorly capture the relationship between exercise content and the language that learners use to express what they want to learn. This semantic gap between queries and content dramatically reduces the effectiveness of general-purpose retrieval models pretrained on large scale information retrieval datasets like MS MARCO."

Progress saved at index: 160


Processing papers:  32%|███████████████████▏                                         | 161/511 [08:31<14:40,  2.52s/it]

Title: OpenToM: A Comprehensive Benchmark for Evaluating Theory-of-Mind Reasoning Capabilities of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Using OpenToM, we reveal that state-of-the-art LLMs thrive at modeling certain aspects of mental states in the physical world but fall short when tracking characters' mental states in the psychological world."

Progress saved at index: 161


Processing papers:  32%|███████████████████▎                                         | 162/511 [08:34<14:27,  2.49s/it]

Title: A Prompt Response to the Demand for Automatic Gender-Neutral Translation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Through extensive manual analyses, our study empirically reveals the inherent limitations of current MT systems in generating GNTs and provides valuable insights into the potential and challenges associated with prompting for neutrality."

Progress saved at index: 162


Processing papers:  32%|███████████████████▍                                         | 163/511 [08:36<14:35,  2.52s/it]

Title: Exploring Visual Culture Awareness in GPT-4V: A Comprehensive Probing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Experimental results indicate that GPT-4V excels at identifying cultural concepts but still exhibits weaker performance in low-resource languages, such as Tamil and Swahili."

Progress saved at index: 163


Processing papers:  32%|███████████████████▌                                         | 164/511 [08:39<14:40,  2.54s/it]

Title: LLMs Among Us: Generative AI Participating in Digital Discourse
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "While this can bring promising opportunities, it also raises many threats, such as biases and privacy concerns, and may contribute to the spread of propaganda by malicious actors."

Progress saved at index: 164


Processing papers:  32%|███████████████████▋                                         | 165/511 [08:41<14:43,  2.55s/it]

Title: WebLINX: Real-World Website Navigation with Multi-Turn Dialogue
Evaluation Result: Title: WebLINX: Real-World Website Navigation with Multi-Turn Dialogue
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Due to the magnitude of information present, Large Language Models (LLMs) cannot process entire web pages in real-time." and "However, all finetuned models struggle to generalize to unseen websites."

Progress saved at index: 165


Processing papers:  32%|███████████████████▊                                         | 166/511 [08:44<14:50,  2.58s/it]

Title: On the Convergence of Zeroth-Order Federated Tuning for Large Language Models
Evaluation Result: Title: On the Convergence of Zeroth-Order Federated Tuning for Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the intensive memory requirements for fine-tuning LLMs pose significant challenges, especially when deploying on clients with limited computational resources."

Progress saved at index: 166


Processing papers:  33%|███████████████████▉                                         | 167/511 [08:49<18:45,  3.27s/it]

Title: Efficient Stagewise Pretraining via Progressive Subnetworks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "it has limitations, particularly the inability to evaluate the full model during earlier stages, and degradation in model quality due to smaller model capacity in the initial stages."

Progress saved at index: 167


Processing papers:  33%|████████████████████                                         | 168/511 [08:52<18:01,  3.15s/it]

Title: FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "FACT-GPT, trained on a synthetic dataset, identifies social media content that aligns with, contradicts, or is irrelevant to previously debunked claims."

Progress saved at index: 168


Processing papers:  33%|████████████████████▏                                        | 169/511 [08:54<16:56,  2.97s/it]

Title: Large Language Model Meets Graph Neural Network in Knowledge Distillation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the deployment of LLMs for production is hindered by its high computational and storage requirements, as well as long latencies during model inference."

Progress saved at index: 169


Processing papers:  33%|████████████████████▎                                        | 170/511 [08:57<16:02,  2.82s/it]

Title: CREMA: Multimodal Compositional Video Reasoning via Efficient Modular Adaptation and Fusion
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "It projects diverse modality features to the LLM token embedding space, allowing the model to integrate different data types for response generation."

Progress saved at index: 170


Processing papers:  33%|████████████████████▍                                        | 171/511 [09:00<15:48,  2.79s/it]

Title: Generative Echo Chamber? Effects of LLM-Powered Search Systems on Diverse Information Seeking
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "little is known about such a risk of LLM-powered conversational search" and "participants engaged in more biased information querying with LLM-powered conversational search, and an opinionated LLM reinforcing their views exacerbated this bias."

Progress saved at index: 171


Processing papers:  34%|████████████████████▌                                        | 172/511 [09:04<18:40,  3.31s/it]

Title: EmojiCrypt: Prompt Encryption for Secure Communication with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "they also introduce significant privacy concerns: the transmission and storage of user data in cloud infrastructures pose substantial risks of data breaches and unauthorized access to sensitive information; even if the transmission and storage of data is encrypted, the LLM service provider itself still knows the real contents of the data, preventing individuals or entities from confidently using such LLM services."

Progress saved at index: 172


Processing papers:  34%|████████████████████▋                                        | 173/511 [09:07<18:46,  3.33s/it]

Title: How Well Can LLMs Negotiate? NegotiationArena Platform and Analysis
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We also quantify irrational negotiation behaviors exhibited by the LLM agents, many of which also appear in humans."

Progress saved at index: 173


Processing papers:  34%|████████████████████▊                                        | 174/511 [09:10<17:53,  3.18s/it]

Title: Is it Possible to Edit Large Language Models Robustly?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, the robustness of model editing remains an open question." and "Our experimental results uncover a substantial disparity between existing editing methods and the practical application of LLMs." and "On rephrased prompts that are complex and flexible but common in realistic applications, the performance of editing experiences a significant decline."

Progress saved at index: 174


Processing papers:  34%|████████████████████▉                                        | 175/511 [09:13<16:58,  3.03s/it]

Title: Training Large Language Models for Reasoning through Reverse Curriculum Reinforcement Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The core challenge in applying RL to complex reasoning is to identify a sequence of actions that result in positive rewards and provide appropriate supervision for optimization. Outcome supervision provides sparse rewards for final results without identifying error locations, whereas process supervision offers step-wise rewards but requires extensive manual annotation."

Progress saved at index: 175


Processing papers:  34%|█████████████████████                                        | 176/511 [09:16<16:30,  2.96s/it]

Title: Limits of Transformer Language Models on Learning Algorithmic Compositions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We observe that the compositional capabilities of state-of-the-art Transformer language models are very limited and sample-wise scale worse than relearning all sub-tasks for a new algorithmic composition."

Progress saved at index: 176


Processing papers:  35%|█████████████████████▏                                       | 177/511 [09:18<15:38,  2.81s/it]

Title: Text-to-Code Generation with Modality-relative Pre-training
Evaluation Result: Title: Text-to-Code Generation with Modality-relative Pre-training
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "programming language keywords (e.g. 'while') often have very strictly defined semantics. As such, transfer learning from their natural language usage may not necessarily be beneficial to their code application and vise versa."

Progress saved at index: 177


Processing papers:  35%|█████████████████████▏                                       | 178/511 [09:21<15:07,  2.73s/it]

Title: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images
Evaluation Result: Title: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "we examine potential gender and racial biases in such systems," and "we observe significant differences in the responses according to the perceived gender or race of the person depicted."

Progress saved at index: 178


Processing papers:  35%|█████████████████████▎                                       | 179/511 [09:24<15:45,  2.85s/it]

Title: TimeArena: Shaping Efficient Multitasking Language Agents in a Time-Aware Simulation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Our findings reveal that even the most powerful models, e.g., GPT-4, still lag behind humans in effective multitasking, underscoring the need for enhanced temporal awareness in the development of language agents."

Progress saved at index: 179


Processing papers:  35%|█████████████████████▍                                       | 180/511 [09:26<14:57,  2.71s/it]

Title: In-Context Learning Can Re-learn Forbidden Tasks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Despite significant investment into safety training, large language models (LLMs) deployed in the real world still suffer from numerous vulnerabilities." and "we investigate whether ICL can undo safety training, which could represent a major security risk."

Progress saved at index: 180


Processing papers:  35%|█████████████████████▌                                       | 181/511 [09:30<17:02,  3.10s/it]

Title: Unified Speech-Text Pretraining for Spoken Dialog Modeling
Evaluation Result: Title: Unified Speech-Text Pretraining for Spoken Dialog Modeling
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "While recent work shows promising results in expanding the capabilities of large language models (LLM) to directly understand and synthesize speech, an LLM-based strategy for modeling spoken dialogs remains elusive and calls for further investigation."

Progress saved at index: 181


Processing papers:  36%|█████████████████████▋                                       | 182/511 [09:33<16:55,  3.09s/it]

Title: Self-Alignment of Large Language Models via Monopolylogue-based Social Scene Simulation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Aligning large language models (LLMs) with human values is imperative to mitigate potential adverse effects resulting from their misuse."

Progress saved at index: 182


Processing papers:  36%|█████████████████████▊                                       | 183/511 [09:36<15:38,  2.86s/it]

Title: Comprehensive Assessment of Jailbreak Attacks Against LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "safeguards have been taken to ensure that LLMs align with social ethics. However, recent findings have revealed an unsettling vulnerability bypassing the safeguards of LLMs, known as jailbreak attacks."

Progress saved at index: 183


Processing papers:  36%|█████████████████████▉                                       | 184/511 [09:40<17:47,  3.26s/it]

Title: Rocks Coding, Not Development--A Human-Centric, Experimental Evaluation of LLM-Supported SE Tasks
Evaluation Result: Title: Rocks Coding, Not Development--A Human-Centric, Experimental Evaluation of LLM-Supported SE Tasks

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We found that while ChatGPT performed well in solving simple coding problems, its performance in supporting typical software development tasks was not that good."

Progress saved at index: 184


Processing papers:  36%|██████████████████████                                       | 185/511 [09:43<16:47,  3.09s/it]

Title: Anfinsen Goes Neural: a Graphical Model for Conditional Antibody Design
Evaluation Result: Title: Anfinsen Goes Neural: a Graphical Model for Conditional Antibody Design
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "We also address a critical limitation of non-autoregressive models -- namely, that they tend to generate unrealistic sequences with overly repeating tokens."

Progress saved at index: 185


Processing papers:  36%|██████████████████████▏                                      | 186/511 [09:45<15:25,  2.85s/it]

Title: The Impact of AI Tool on Engineering at ANZ Bank An Emperical Study on GitHub Copilot within Coporate Environment
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The increasing popularity of AI, particularly Large Language Models (LLMs), has significantly impacted various domains, including Software Engineering."

Progress saved at index: 186


Processing papers:  37%|██████████████████████▎                                      | 187/511 [09:47<14:22,  2.66s/it]

Title: Merging Facts, Crafting Fallacies: Evaluating the Contradictory Nature of Aggregated Factual Claims in Long-Form Generations
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We show that LLMs can generate paragraphs that contain verifiable facts, but the facts are combined to form a non-factual paragraph due to entity ambiguity." and "We also find that four widely used open-source LLMs tend to mix information of distinct entities to form non-factual paragraphs."

Progress saved at index: 187


Processing papers:  37%|██████████████████████▍                                      | 188/511 [09:50<14:07,  2.62s/it]

Title: Efficient Models for the Detection of Hate, Abuse and Profanity
Evaluation Result: Title: Efficient Models for the Detection of Hate, Abuse and Profanity
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Due to the LLMs being exposed to HAP content during training, the models learn it and may then generate hateful or profane content."

Progress saved at index: 188


Processing papers:  37%|██████████████████████▌                                      | 189/511 [09:53<15:03,  2.81s/it]

Title: AttnLRP: Attention-Aware Layer-wise Relevance Propagation for Transformers
Evaluation Result: Title: AttnLRP: Attention-Aware Layer-wise Relevance Propagation for Transformers

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Large Language Models are prone to biased predictions and hallucinations, underlining the paramount importance of understanding their model-internal reasoning process."

Progress saved at index: 189


Processing papers:  37%|██████████████████████▋                                      | 190/511 [09:55<14:24,  2.69s/it]

Title: Benchmarking Large Language Models on Communicative Medical Coaching: a Novel System and Dataset
Evaluation Result: Title: Benchmarking Large Language Models on Communicative Medical Coaching: a Novel System and Dataset

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To construct the ChatCoach system, we developed a dataset and integrated Large Language Models such as ChatGPT and Llama2, aiming to assess their effectiveness in communicative medical coaching tasks."

Progress saved at index: 190


Processing papers:  37%|██████████████████████▊                                      | 191/511 [09:58<14:13,  2.67s/it]

Title: Can ChatGPT evaluate research quality?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "ChatGPT does not yet seem to be accurate enough to be trusted for any formal or informal research quality evaluation tasks."

Progress saved at index: 191


Processing papers:  38%|██████████████████████▉                                      | 192/511 [10:00<13:12,  2.48s/it]

Title: Question Aware Vision Transformer for Multimodal Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Despite their success, a critical limitation persists: the vision encoding process remains decoupled from user queries, often in the form of image-related questions."

Progress saved at index: 192


Processing papers:  38%|███████████████████████                                      | 193/511 [10:02<12:57,  2.45s/it]

Title: Rapid Optimization for Jailbreaking LLMs via Subconscious Exploitation and Echopraxia
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Despite their potential, recent research indicates aligned LLMs are prone to specialized jailbreaking prompts that bypass safety measures to elicit violent and harmful content. The intrinsic discrete nature and substantial scale of contemporary LLMs pose significant challenges in automatically generating diverse, efficient, and potent jailbreaking prompts, representing a continuous obstacle."

Progress saved at index: 193


Processing papers:  38%|███████████████████████▏                                     | 194/511 [10:05<13:18,  2.52s/it]

Title: It's Never Too Late: Fusing Acoustic Information into Large Language Models for Automatic Speech Recognition
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "GER introduces extra data uncertainty since the LLM is trained without taking into account acoustic information available in the speech signal."

Progress saved at index: 194


Processing papers:  38%|███████████████████████▎                                     | 195/511 [10:07<12:38,  2.40s/it]

Title: Large Language Models for Psycholinguistic Plausibility Pretesting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We find that when coarse-grained plausibility judgements are needed, this works well, but when fine-grained judgements are necessary, even GPT-4 does not provide satisfactory discriminative power."

Progress saved at index: 195


Processing papers:  38%|███████████████████████▍                                     | 196/511 [10:10<12:51,  2.45s/it]

Title: Accurate LoRA-Finetuning Quantization of LLMs via Information Retention
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, existing methods cause the quantized LLM to severely degrade and even fail to benefit from the finetuning of LoRA."

Progress saved at index: 196


Processing papers:  39%|███████████████████████▌                                     | 197/511 [10:12<12:09,  2.32s/it]

Title: Do Large Code Models Understand Programming Concepts? A Black-box Approach
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Our findings suggest that current models lack understanding of concepts such as data flow and control flow."

Progress saved at index: 197


Processing papers:  39%|███████████████████████▋                                     | 198/511 [10:14<11:41,  2.24s/it]

Title: GPT-4 Generated Narratives of Life Events using a Structured Narrative Prompt: A Validation Study
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "All the ML models excelled at classifying valid narratives as valid, but experienced challenges at simultaneously classifying invalid narratives as invalid."

Progress saved at index: 198


Processing papers:  39%|███████████████████████▊                                     | 199/511 [10:16<11:58,  2.30s/it]

Title: Everybody Prune Now: Structured Pruning of LLMs with only Forward Passes
Evaluation Result: Title: Everybody Prune Now: Structured Pruning of LLMs with only Forward Passes

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "LLMs are becoming increasingly inaccessible as they grow in size. Whilst many approaches have been proposed to compress LLMs to make their resource consumption manageable, these methods themselves tend to be resource intensive, putting them out of the reach of the very user groups they target."

Progress saved at index: 199


Processing papers:  39%|███████████████████████▊                                     | 200/511 [10:19<12:37,  2.44s/it]

Title: In-Context Principle Learning from Mistakes
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Nonetheless, all ICL-based approaches only learn from correct input-output pairs."

Progress saved at index: 200


Processing papers:  39%|███████████████████████▉                                     | 201/511 [10:21<11:59,  2.32s/it]

Title: Enhancing Zero-shot Counting via Language-guided Exemplar Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "inheriting rich semantic priors from the prevailing pre-trained Large Language Models (LLMs)"

Progress saved at index: 201


Processing papers:  40%|████████████████████████                                     | 202/511 [10:23<11:39,  2.26s/it]

Title: CIC: A framework for Culturally-aware Image Captioning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, current methods lack the generation of detailed descriptive captions for the cultural elements depicted in the images, such as the traditional clothing worn by people from Asian cultural groups."

Progress saved at index: 202


Processing papers:  40%|████████████████████████▏                                    | 203/511 [10:25<11:43,  2.28s/it]

Title: Prompting with Divide-and-Conquer Program Makes Large Language Models Discerning to Hallucination and Deception
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "existing prompting strategies either suffers from insufficient expressive power or intermediate errors triggered by hallucination."

Progress saved at index: 203


Processing papers:  40%|████████████████████████▎                                    | 204/511 [10:27<11:05,  2.17s/it]

Title: Scaling Up LLM Reviews for Google Ads Content Moderation
Evaluation Result: Title: Scaling Up LLM Reviews for Google Ads Content Moderation

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large language models (LLMs) are powerful tools for content moderation, but their inference costs and latency make them prohibitive for casual use on large datasets, such as the Google Ads repository."

Progress saved at index: 204


Processing papers:  40%|████████████████████████▍                                    | 205/511 [10:30<11:27,  2.25s/it]

Title: Using text embedding models and vector databases as text classifiers with the example of medical data
Evaluation Result: Title: Using text embedding models and vector databases as text classifiers with the example of medical data

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Using various LLMs to generate the medical data, we also understand the limitations of the medical knowledge of these models and encourage further expert medical review of our testing data."

Progress saved at index: 205


Processing papers:  40%|████████████████████████▌                                    | 206/511 [10:32<12:03,  2.37s/it]

Title: Are LLMs Ready for Real-World Materials Discovery?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "While LLMs have great potential to accelerate materials understanding and discovery, they currently fall short in being practical materials science tools. In this position paper, we show relevant failure cases of LLMs in materials science that reveal current limitations of LLMs related to comprehending and reasoning over complex, interconnected materials science knowledge."

Progress saved at index: 206


Processing papers:  41%|████████████████████████▋                                    | 207/511 [10:35<12:47,  2.52s/it]

Title: $λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion Models by Leveraging CLIP Latent Space
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Predominantly, contemporary approaches, involving the training of Hypernetworks and Multimodal Large Language Models (MLLMs), require heavy computing resources that range from 600 to 12300 GPU hours of training."

Progress saved at index: 207


Processing papers:  41%|████████████████████████▊                                    | 208/511 [10:38<13:27,  2.66s/it]

Title: InCoRo: In-Context Learning for Robotics Control with Feedback Loops
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Recent advances in LLMs have positioned them as go-to tools for simple reasoning tasks, motivating the pioneering work of Liang et al. [35] that uses an LLM to translate natural language commands into low-level static execution plans for robotic units."

Progress saved at index: 208


Processing papers:  41%|████████████████████████▉                                    | 209/511 [10:42<14:11,  2.82s/it]

Title: Opening the AI black box: program synthesis via mechanistic interpretability
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "As opposed to large language models, this program synthesis technique makes no use of (and is therefore not limited by) human training data such as algorithms and code from GitHub."

Progress saved at index: 209


Processing papers:  41%|█████████████████████████                                    | 210/511 [10:44<13:21,  2.66s/it]

Title: Hydragen: High-Throughput LLM Inference with Shared Prefixes
Evaluation Result: Title: Hydragen: High-Throughput LLM Inference with Shared Prefixes
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Transformer-based large language models (LLMs) are now deployed to hundreds of millions of users."

Progress saved at index: 210


Processing papers:  41%|█████████████████████████▏                                   | 211/511 [10:46<13:03,  2.61s/it]

Title: Language-Based Augmentation to Address Shortcut Learning in Object Goal Navigation
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention LLMs or any specific limitations related to them.

Progress saved at index: 211


Processing papers:  41%|█████████████████████████▎                                   | 212/511 [10:49<13:24,  2.69s/it]

Title: Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank Modifications
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large language models (LLMs) show inherent brittleness in their safety mechanisms, as evidenced by their susceptibility to jailbreaking and even non-malicious fine-tuning." and "These findings underscore the urgent need for more robust safety strategies in LLMs."

Progress saved at index: 212


Processing papers:  42%|█████████████████████████▍                                   | 213/511 [10:52<13:20,  2.69s/it]

Title: SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models
Evaluation Result: **Title: SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models**

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Our extensive experiments shed light on the resilience of LLMs against emerging threats and the efficacy of contemporary defense tactics."

Progress saved at index: 213


Processing papers:  42%|█████████████████████████▌                                   | 214/511 [10:54<12:45,  2.58s/it]

Title: A Sober Look at LLMs for Material Discovery: Are They Actually Good for Bayesian Optimization Over Molecules?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, existing work thus far has only explored LLMs for heuristic materials searches. Indeed, recent work obtains the uncertainty estimate -- an integral part of BO -- from point-estimated, non-Bayesian LLMs."

Progress saved at index: 214


Processing papers:  42%|█████████████████████████▋                                   | 215/511 [10:57<12:53,  2.61s/it]

Title: Pedagogical Alignment of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this paper, we introduce the novel concept of pedagogically aligned Large Language Models (LLMs) that signifies a transformative shift in the application of LLMs within educational contexts."

Progress saved at index: 215


Processing papers:  42%|█████████████████████████▊                                   | 216/511 [10:59<12:36,  2.56s/it]

Title: An Enhanced Prompt-Based LLM Reasoning Scheme via Knowledge Graph-Integrated Collaboration
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "they encounter challenges in practical applications, including issues with hallucinations, inadequate knowledge updating, and limited transparency in the reasoning process."

Progress saved at index: 216


Processing papers:  42%|█████████████████████████▉                                   | 217/511 [11:02<12:01,  2.45s/it]

Title: ChatScratch: An AI-Augmented System Toward Autonomous Visual Programming Learning for Children Aged 6-12
Evaluation Result: Title: ChatScratch: An AI-Augmented System Toward Autonomous Visual Programming Learning for Children Aged 6-12

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "leverages Scratch-specialized Large Language Models (LLMs) for professional coding guidance."

---

Title: A Survey of Large Language Models in Healthcare: Applications, Challenges, and Future Directions
Paper: "Large Language Models (LLMs) have shown immense potential

Progress saved at index: 217


Processing papers:  43%|██████████████████████████                                   | 218/511 [11:05<12:58,  2.66s/it]

Title: Reconfidencing LLMs from the Grouping Loss Perspective
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models (LLMs), including ChatGPT and LLaMA, are susceptible to generating hallucinated answers in a confident tone." and "Experiments show that they tend to be overconfident. Further, we show that they are more overconfident on some answers than others, \emph{eg} depending on the nationality of the person in the query."

Progress saved at index: 218


Processing papers:  43%|██████████████████████████▏                                  | 219/511 [11:08<13:52,  2.85s/it]

Title: Chatbots in Knowledge-Intensive Contexts: Comparing Intent and LLM-Based Systems
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the additional degrees of freedom may have unforeseen consequences, especially in knowledge-intensive contexts where accuracy is crucial."

Progress saved at index: 219


Processing papers:  43%|██████████████████████████▎                                  | 220/511 [11:10<12:47,  2.64s/it]

Title: Prompting Implicit Discourse Relation Annotation
Evaluation Result: Title: Prompting Implicit Discourse Relation Annotation
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Nonetheless, ChatGPT's performance in the task of implicit discourse relation classification, prompted by a standard multiple-choice question, is still far from satisfactory and considerably inferior to state-of-the-art supervised approaches."

Progress saved at index: 220


Processing papers:  43%|██████████████████████████▍                                  | 221/511 [11:13<12:46,  2.64s/it]

Title: L4Q: Parameter Efficient Quantization-Aware Training on Large Language Models via LoRA-wise LSQ
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Post-training quantization (PTQ) and quantization-aware training (QAT) methods are gaining popularity in mitigating the high memory and computational costs associated with Large Language Models (LLMs)." and "However, these approaches may lack generality due to their reliance on the pre-quantized model's configuration. Their effectiveness may be compromised by non-linearly quantized or mixed-

Progress saved at index: 221


Processing papers:  43%|██████████████████████████▌                                  | 222/511 [11:16<13:15,  2.75s/it]

Title: Detecting Generated Native Ads in Conversational Search
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "In our experiments sentence transformers achieve detection precision and recall values above 0.9, while the investigated LLMs struggle with the task."

Progress saved at index: 222


Processing papers:  44%|██████████████████████████▌                                  | 223/511 [11:18<12:33,  2.61s/it]

Title: Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "not only do such methods incur performance overhead during generation, but many of them also significantly impair task accuracy, if they do not correctly align the underlying LLM sub-word vocabularies with external constraints."

Progress saved at index: 223


Processing papers:  44%|██████████████████████████▋                                  | 224/511 [11:20<12:10,  2.54s/it]

Title: Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for Instruction Fine-Tuning
Evaluation Result: Title: Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for Instruction Fine-Tuning

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We also conduct a thorough analysis of our models to ensure that their enhanced performance is not simply due to GPT-4's preference for longer responses, thus ruling out any artificial improvement."

---

Title: Investigating Robustness in Large Language Models: A Comprehensive Evaluation Framework

Progress saved at index: 224


Processing papers:  44%|██████████████████████████▊                                  | 225/511 [11:23<12:05,  2.54s/it]

Title: Direct Language Model Alignment from Online AI Feedback
Evaluation Result: Title: Direct Language Model Alignment from Online AI Feedback
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the preference datasets used in DAP methods are usually collected ahead of training and never updated, thus the feedback is purely offline. Moreover, responses in these datasets are often sampled from a language model distinct from the one being aligned, and since the model evolves over training, the alignment phase is inevitably off-policy."

Progress saved at index: 225


Processing papers:  44%|██████████████████████████▉                                  | 226/511 [11:26<13:25,  2.82s/it]

Title: MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with Vision-Language Benchmark
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "MLLMs still face challenges in judgment, including diverse biases, hallucinatory responses, and inconsistencies, even for advanced models such as GPT-4V."

Progress saved at index: 226


Processing papers:  44%|███████████████████████████                                  | 227/511 [11:29<13:13,  2.79s/it]

Title: A Hypothesis-Driven Framework for the Analysis of Self-Rationalising Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "how faithful the explanations are to the predictions is questionable, raising the need to explore the patterns behind them further" and "The resulting models do not exhibit a strong similarity to GPT-3.5. We discuss the implications of this as well as the framework's potential to approximate LLM decisions better in future work."

Progress saved at index: 227


Processing papers:  45%|███████████████████████████▏                                 | 228/511 [11:33<15:02,  3.19s/it]

Title: ApiQ: Finetuning of 2-Bit Quantized Large Language Model
Evaluation Result: Title: ApiQ: Finetuning of 2-Bit Quantized Large Language Model

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "current strategies for memory-efficient finetuning, such as QLoRA, exhibit inconsistent performance across diverse bit-width quantizations and multifaceted tasks. This inconsistency largely stems from the detrimental impact of the quantization process on preserved knowledge, leading to catastrophic forgetting and undermining the utilization of

Progress saved at index: 228


Processing papers:  45%|███████████████████████████▎                                 | 229/511 [11:36<14:43,  3.13s/it]

Title: Large Language Models As Faithful Explainers
Evaluation Result: Title: Large Language Models As Faithful Explainers
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "natural language explanations are often criticized for lack of faithfulness since these explanations may not accurately reflect the decision-making behaviors of the LLMs."

Progress saved at index: 229


Processing papers:  45%|███████████████████████████▍                                 | 230/511 [11:39<13:34,  2.90s/it]

Title: LEVI: Generalizable Fine-tuning via Layer-wise Ensemble of Different Views
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "potential limitations in the pre-training data and models are often ignored."

Progress saved at index: 230


Processing papers:  45%|███████████████████████████▌                                 | 231/511 [11:41<12:17,  2.63s/it]

Title: The Future of Cognitive Strategy-enhanced Persuasive Dialogue Agents: New Perspectives and Trends
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Benefiting from the substantial progress of Large Language Models (LLMs), dialogue agents have acquired an exceptional capability in context understanding and response generation. However, as a typical and complicated cognitive psychological system, persuasive dialogue agents also require knowledge from the domain of cognitive psychology to attain a level of human-like persuasion."

Progress saved at index: 231


Processing papers:  45%|███████████████████████████▋                                 | 232/511 [11:44<12:52,  2.77s/it]

Title: SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question Answering over a Life Science Knowledge Graph
Evaluation Result: Title: SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question Answering over a Life Science Knowledge Graph

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "one of the main obstacles preventing their implementation is the scarcity of training data for the task of translating questions into corresponding SPARQL queries, particularly in the case of domain-specific KGs."

Progress saved at index: 232


Processing papers:  46%|███████████████████████████▊                                 | 233/511 [11:48<14:34,  3.14s/it]

Title: MEMORYLLM: Towards Self-Updatable Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Existing Large Language Models (LLMs) usually remain static after deployment, which might make it hard to inject new knowledge into the model."

Progress saved at index: 233


Processing papers:  46%|███████████████████████████▉                                 | 234/511 [11:50<13:09,  2.85s/it]

Title: CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract Patients
Evaluation Result: Title: CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract Patients
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To address this gap, we propose CataractBot, an experts-in-the-loop chatbot powered by large language models (LLMs)."

Progress saved at index: 234


Processing papers:  46%|████████████████████████████                                 | 235/511 [11:53<13:08,  2.86s/it]

Title: InfLLM: Unveiling the Intrinsic Capacity of LLMs for Understanding Extremely Long Sequences with Training-Free Memory
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "existing LLMs, pre-trained on sequences with restricted maximum length, cannot generalize to longer sequences due to the out-of-domain and distraction issues."

Progress saved at index: 235


Processing papers:  46%|████████████████████████████▏                                | 236/511 [11:55<12:11,  2.66s/it]

Title: TinyLLM: Learning a Small Student from Multiple Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, existing methods suffer from several drawbacks, including limited knowledge diversity and the lack of rich contextual information."

Progress saved at index: 236


Processing papers:  46%|████████████████████████████▎                                | 237/511 [11:57<11:39,  2.55s/it]

Title: Faithfulness vs. Plausibility: On the (Un)Reliability of Explanations from Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 5.  
Evidence: "However, there is little to no understanding of their faithfulness," "we discuss the dichotomy between faithfulness and plausibility in SEs generated by LLMs," "these explanations do not necessarily align with the reasoning processes of the LLMs, raising concerns about their faithfulness," and "improving faithfulness is an open challenge."

Progress saved at index: 237


Processing papers:  47%|████████████████████████████▍                                | 238/511 [12:01<12:59,  2.86s/it]

Title: Improving Cross-Domain Low-Resource Text Generation through LLM Post-Editing: A Programmer-Interpreter Approach
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, relying solely on smaller language models for post-editing can limit the LLMs' ability to generalize across domains."

Progress saved at index: 238


Processing papers:  47%|████████████████████████████▌                                | 239/511 [12:03<12:13,  2.70s/it]

Title: Alirector: Alignment-Enhanced Chinese Grammatical Error Corrector
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Chinese grammatical error correction (CGEC) faces serious overcorrection challenges when employing autoregressive generative models such as sequence-to-sequence (Seq2Seq) models and decoder-only large language models (LLMs)."

Progress saved at index: 239


Processing papers:  47%|████████████████████████████▋                                | 240/511 [12:06<12:52,  2.85s/it]

Title: The Role of LLMs in Sustainable Smart Cities: Applications, Challenges, and Future Directions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Our discourse culminates with an exploration of the formidable challenges that DL, FL, IoT, Blockchain, NLP, and LLMs face within these contexts, and we offer insights into potential future directions."

Progress saved at index: 240


Processing papers:  47%|████████████████████████████▊                                | 241/511 [12:09<12:52,  2.86s/it]

Title: Can Large Language Model Agents Simulate Human Trust Behaviors?
Evaluation Result: Title: Can Large Language Model Agents Simulate Human Trust Behaviors?
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We also explore the intrinsic properties of agent trust under conditions including advanced reasoning strategies and external manipulations."

Progress saved at index: 241


Processing papers:  47%|████████████████████████████▉                                | 242/511 [12:12<13:14,  2.96s/it]

Title: An Artificial Intelligence (AI) workflow for catalyst design and optimization
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "this study proposes an innovative Artificial Intelligence (AI) workflow that integrates Large Language Models (LLMs), Bayesian optimization, and an active learning loop to expedite and enhance catalyst optimization."

Progress saved at index: 242


Processing papers:  48%|█████████████████████████████                                | 243/511 [12:15<12:36,  2.82s/it]

Title: RA-Rec: An Efficient ID Representation Alignment Framework for LLM-based Recommendation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Current approaches generally fall into two main paradigms, the ID direct usage paradigm and the ID translation paradigm, noting their core weakness stems from lacking recommendation knowledge and uniqueness."

Progress saved at index: 243


Processing papers:  48%|█████████████████████████████▏                               | 244/511 [12:17<11:54,  2.68s/it]

Title: Online Cascade Learning for Efficient Inference over Streams
Evaluation Result: Title: Online Cascade Learning for Efficient Inference over Streams
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models (LLMs) have a natural role in answering complex queries about data streams, but the high computational cost of LLM inference makes them infeasible in many such tasks."

Progress saved at index: 244


Processing papers:  48%|█████████████████████████████▏                               | 245/511 [12:20<12:14,  2.76s/it]

Title: The Fine-Grained Complexity of Gradient Computation for Training Large Language Models
Evaluation Result: Title: The Fine-Grained Complexity of Gradient Computation for Training Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "there is no truly sub-quadratic time algorithm in the remaining parameter regimes unless the popular hypothesis SETH is false."

Progress saved at index: 245


Processing papers:  48%|█████████████████████████████▎                               | 246/511 [12:23<11:59,  2.71s/it]

Title: Grandmaster-Level Chess Without Search
Evaluation Result: Title: Grandmaster-Level Chess Without Search
Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The paper focuses on a transformer model trained for chess and does not discuss language models.

Progress saved at index: 246


Processing papers:  48%|█████████████████████████████▍                               | 247/511 [12:25<11:28,  2.61s/it]

Title: De-amplifying Bias from Differential Privacy in Language Model Fine-tuning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We show that DP amplifies gender, racial, and religious bias when fine-tuning large language models (LLMs), producing models more biased than ones fine-tuned without DP."

Progress saved at index: 247


Processing papers:  49%|█████████████████████████████▌                               | 248/511 [12:27<10:55,  2.49s/it]

Title: Unmasking the Shadows of AI: Investigating Deceptive Capabilities in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "concentrating on deceptive behaviours of Large Language Models (LLMs)" and "emphasising multidimensional biases that underlie their deceptive behaviours" and "the literature review covers four types of deception categorised: Strategic deception, Imitation, Sycophancy, and Unfaithful Reasoning, along with the social implications and risks they entail."

Progress saved at index: 248


Processing papers:  49%|█████████████████████████████▋                               | 249/511 [12:30<11:15,  2.58s/it]

Title: Detecting Mode Collapse in Language Models via Narration
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "we show successive versions of GPT-3 suffer from increasing degrees of 'mode collapse' whereby overfitting the model during alignment constrains it from generalizing over authorship: models suffering from mode collapse become unable to assume a multiplicity of perspectives."

Progress saved at index: 249


Processing papers:  49%|█████████████████████████████▊                               | 250/511 [12:33<11:09,  2.56s/it]

Title: Structured Entity Extraction Using Large Language Models
Evaluation Result: Title: Structured Entity Extraction Using Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "This paper explores the challenges and limitations of current methodologies in structured entity extraction and introduces a novel approach to address these issues."

Progress saved at index: 250


Processing papers:  49%|█████████████████████████████▉                               | 251/511 [12:35<10:39,  2.46s/it]

Title: Chatbot Meets Pipeline: Augment Large Language Model with Definite Finite Automaton
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Traditional LLMs face challenges in generating regulated and compliant responses in special scenarios with predetermined response guidelines, like emotional support and customer service."

Progress saved at index: 251


Processing papers:  49%|██████████████████████████████                               | 252/511 [12:38<11:00,  2.55s/it]

Title: Democratizing Large Language Models via Personalized Parameter-Efficient Fine-tuning
Evaluation Result: Title: Democratizing Large Language Models via Personalized Parameter-Efficient Fine-tuning
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, these approaches were limited due to a lack of model ownership, resulting in constrained customization and privacy issues. Moreover, they often failed to accurately capture user behavior patterns, especially in cases where user data were complex and dynamic."

Progress saved at index: 252


Processing papers:  50%|██████████████████████████████▏                              | 253/511 [12:41<12:12,  2.84s/it]

Title: Fine-Tuned Language Models Generate Stable Inorganic Materials as Text
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We propose fine-tuning large language models for generation of stable materials."

Progress saved at index: 253


Processing papers:  50%|██████████████████████████████▎                              | 254/511 [12:44<12:04,  2.82s/it]

Title: Monitoring the evolution of antisemitic discourse on extremist social media using BERT
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we created an unsupervised online machine learning approach that uses large language models to assess the contextual similarity of posts."

Progress saved at index: 254


Processing papers:  50%|██████████████████████████████▍                              | 255/511 [12:47<12:48,  3.00s/it]

Title: The World of Generative AI: Deepfakes and Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLMs are powerful language models that generate general-purpose language. However due to its generative aspect, it can also be a risk for people if used with ill intentions."

Progress saved at index: 255


Processing papers:  50%|██████████████████████████████▌                              | 256/511 [12:50<12:46,  3.01s/it]

Title: Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, their capabilities wane in highly specialized domains underrepresented in the pretraining corpus, such as physical and biomedical sciences."

Progress saved at index: 256


Processing papers:  50%|██████████████████████████████▋                              | 257/511 [12:53<12:12,  2.88s/it]

Title: The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax Mimicry
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, linear attentions often underperform standard softmax attention in quality."

Progress saved at index: 257


Processing papers:  50%|██████████████████████████████▊                              | 258/511 [12:55<11:06,  2.63s/it]

Title: LESS: Selecting Influential Data for Targeted Instruction Tuning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The challenge lies in identifying the most relevant data from these extensive datasets to effectively develop specific capabilities, a setting we frame as targeted instruction tuning."

Progress saved at index: 258


Processing papers:  51%|██████████████████████████████▉                              | 259/511 [12:58<11:10,  2.66s/it]

Title: Training Language Models to Generate Text with Citations via Fine-grained Rewards
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "While recent Large Language Models (LLMs) have proven useful in answering user queries, they are prone to hallucination, and their responses often lack credibility due to missing references to reliable sources."

Progress saved at index: 259


Processing papers:  51%|███████████████████████████████                              | 260/511 [13:01<11:07,  2.66s/it]

Title: HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Automated red teaming holds substantial promise for uncovering and mitigating the risks associated with the malicious use of large language models (LLMs)," and "We identify several desirable properties previously unaccounted for in red teaming evaluations."

Progress saved at index: 260


Processing papers:  51%|███████████████████████████████▏                             | 261/511 [13:03<10:59,  2.64s/it]

Title: Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "While their capabilities are promising, they also introduce novel vulnerabilities that demand careful consideration for safety." and "We begin by providing a comprehensive overview of the potential risks inherent to scientific LLM agents," and "Furthermore, we highlight the limitations and challenges associated with safeguarding scientific agents."

Progress saved at index: 261


Processing papers:  51%|███████████████████████████████▎                             | 262/511 [13:06<10:54,  2.63s/it]

Title: Can Generative Agents Predict Emotion?
Evaluation Result: Title: Can Generative Agents Predict Emotion?
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The mixed results suggests that introducing context can occasionally improve the emotional alignment of the agent, but further study and comparison with human evaluators is necessary."

Progress saved at index: 262


Processing papers:  51%|███████████████████████████████▍                             | 263/511 [13:08<10:41,  2.59s/it]

Title: Scaling Laws for Downstream Task Performance of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, there are also cases where moderate misalignment causes the BLEU score to fluctuate or get worse with more pretraining, whereas downstream cross-entropy monotonically improves."

Progress saved at index: 263


Processing papers:  52%|███████████████████████████████▌                             | 264/511 [13:11<10:45,  2.61s/it]

Title: Harnessing the Plug-and-Play Controller by Prompting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Previous approaches, such as plug-and-play controllers (PPCs), aimed to steer the properties of generated text in a flexible manner. However, these methods often compromised the integrity of the language model's decoding process, resulting in less smooth text generation."

Progress saved at index: 264


Processing papers:  52%|███████████████████████████████▋                             | 265/511 [13:13<10:36,  2.59s/it]

Title: Multi-line AI-assisted Code Authoring
Evaluation Result: **Title: Multi-line AI-assisted Code Authoring**

Does it talk about LLMs: Yes.

Rate Limitations of LLMs: 3.

Evidence: "First, we discuss how multi-line suggestions can have a 'jarring' effect, as the LLM's suggestions constantly move around the developer's existing code, which would otherwise result in decreased productivity and satisfaction." and "multi-line suggestions take significantly longer to generate; hence we present several innovative investments we made to reduce the perceived

Progress saved at index: 265


Processing papers:  52%|███████████████████████████████▊                             | 266/511 [13:16<10:56,  2.68s/it]

Title: Advancing Legal Reasoning: The Integration of AI to Navigate Complexities and Biases in Global Jurisprudence with Semi-Automated Arbitration Processes (SAAPs)
Evaluation Result: Title: Advancing Legal Reasoning: The Integration of AI to Navigate Complexities and Biases in Global Jurisprudence with Semi-Automated Arbitration Processes (SAAPs)
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "By incorporating Advanced Language Models (ALMs) and a newly introduced human-AI collaborative framework, this paper seeks to analyze Grounded Theory-based research design with Advanced Language Models (ALMs)

Progress saved at index: 266


Processing papers:  52%|███████████████████████████████▊                             | 267/511 [13:20<11:41,  2.87s/it]

Title: Scientific Language Modeling: A Quantitative Review of Large Language Models in Molecular Science
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "two key issues remain: how to quantify the match between model and data modalities and how to identify the knowledge-learning preferences of models."

Progress saved at index: 267


Processing papers:  52%|███████████████████████████████▉                             | 268/511 [13:22<11:32,  2.85s/it]

Title: Measuring Implicit Bias in Explicitly Unbiased Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large language models (LLMs) can pass explicit bias tests but still harbor implicit biases," and "Using these measures, we found pervasive human-like stereotype biases in 6 LLMs across 4 social domains (race, gender, religion, health) and 21 categories (weapons, guilt, science, career among others)."

Progress saved at index: 268


Processing papers:  53%|████████████████████████████████                             | 269/511 [13:26<12:56,  3.21s/it]

Title: The Use of a Large Language Model for Cyberbullying Detection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, their performances are not consistent due to high class imbalance and generalisation issues."

Progress saved at index: 269


Processing papers:  53%|████████████████████████████████▏                            | 270/511 [13:29<11:52,  2.96s/it]

Title: Provably learning a multi-head attention layer
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We focus on Boolean $\mathbf{X}$ to mimic the discrete nature of tokens in large language models, though our techniques naturally extend to standard continuous settings, e.g. Gaussian."

Progress saved at index: 270


Processing papers:  53%|████████████████████████████████▎                            | 271/511 [13:32<12:01,  3.01s/it]

Title: Systematic Biases in LLM Simulations of Debates
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "In this study, we highlight the limitations of LLMs in simulating human interactions, particularly focusing on LLMs' ability to simulate political debates. Our findings indicate a tendency for LLM agents to conform to the model's inherent social biases despite being directed to debate from certain political perspectives."

Progress saved at index: 271


Processing papers:  53%|████████████████████████████████▍                            | 272/511 [13:35<11:52,  2.98s/it]

Title: LLM Agents can Autonomously Hack Websites
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Our findings raise questions about the widespread deployment of LLMs."

Progress saved at index: 272


Processing papers:  53%|████████████████████████████████▌                            | 273/511 [13:37<10:45,  2.71s/it]

Title: Understanding the Effect of Noise in LLM Training Data with Algorithmic Chains of Thought
Evaluation Result: Title: Understanding the Effect of Noise in LLM Training Data with Algorithmic Chains of Thought

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We find fine-tuned models are extremely robust to high levels of static noise but struggle significantly more with lower levels of dynamic noise."

Progress saved at index: 273


Processing papers:  54%|████████████████████████████████▋                            | 274/511 [13:40<10:46,  2.73s/it]

Title: Enhancing Retrieval Processes for Language Generation with Augmented Queries
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "These models sometimes face difficulties, like providing inaccurate information, commonly known as 'hallucination.'"

Progress saved at index: 274


Processing papers:  54%|████████████████████████████████▊                            | 275/511 [13:42<10:19,  2.62s/it]

Title: LV-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Issues related to knowledge leakage and inaccurate metrics introduce bias in evaluation," and "LLMs' performances can significantly degrade in the presence of confusing information, especially in the pressure test of 'needle in a haystack'."

Progress saved at index: 275


Processing papers:  54%|████████████████████████████████▉                            | 276/511 [13:45<10:25,  2.66s/it]

Title: Discovery of the Hidden World with Large Language Models
Evaluation Result: Title: Discovery of the Hidden World with Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The rise of large language models (LLMs) that are trained to learn rich knowledge from the massive observations of the world, provides a new opportunity to assist with discovering high-level hidden variables from the raw observational data."

Progress saved at index: 276


Processing papers:  54%|█████████████████████████████████                            | 277/511 [13:47<10:21,  2.66s/it]

Title: Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in Closed-Source LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "The lack of access to model details, especially regarding training data, has repeatedly raised concerns about data contamination among researchers." and "we document a number of evaluation malpractices emerging in the reviewed papers, such as unfair or missing baseline comparisons and reproducibility issues."

Progress saved at index: 277


Processing papers:  54%|█████████████████████████████████▏                           | 278/511 [13:50<10:15,  2.64s/it]

Title: Large Language Models to Enhance Bayesian Optimization
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we present LLAMBO, a novel approach that integrates the capabilities of Large Language Models (LLM) within BO."

Progress saved at index: 278


Processing papers:  55%|█████████████████████████████████▎                           | 279/511 [13:54<11:22,  2.94s/it]

Title: Can Large Language Models Detect Rumors on Social Media?
Evaluation Result: Title: Can Large Language Models Detect Rumors on Social Media?
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "it is challenging for LLMs to reason over the entire propagation information on social media, which contains news contents and numerous comments, due to LLMs may not concentrate on key clues in the complex propagation information, and have trouble in reasoning when facing massive and redundant information."

Progress saved at index: 279


Processing papers:  55%|█████████████████████████████████▍                           | 280/511 [13:57<11:19,  2.94s/it]

Title: Embedding Large Language Models into Extended Reality: Opportunities and Challenges for Inclusion, Engagement, and Privacy
Evaluation Result: Title: Embedding Large Language Models into Extended Reality: Opportunities and Challenges for Inclusion, Engagement, and Privacy
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Lastly, we speculate that combining the information provided to LLM-powered environments by the users and the biometric data obtained through the sensors might lead to novel privacy invasions. While studying such possible privacy invasions, user privacy concerns and preferences should also be investigated."

Progress saved at index: 280


Processing papers:  55%|█████████████████████████████████▌                           | 281/511 [14:00<11:16,  2.94s/it]

Title: DistiLLM: Towards Streamlined Distillation for Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "current KD methods for auto-regressive sequence models (e.g., large language models) suffer from missing a standardized objective function. Moreover, the recent use of student-generated outputs to address training-inference mismatches has significantly escalated computational costs."

Progress saved at index: 281


Processing papers:  55%|█████████████████████████████████▋                           | 282/511 [14:02<11:02,  2.89s/it]

Title: Beyond Lines and Circles: Unveiling the Geometric Reasoning Gap in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Our work reveals notable challenges that the state-of-the-art LLMs face in this domain despite many successes in similar areas. LLMs exhibit biases in target variable selection and struggle with 2D spatial relationships, often misrepresenting and hallucinating objects and their placements."

Progress saved at index: 282


Processing papers:  55%|█████████████████████████████████▊                           | 283/511 [14:05<10:30,  2.76s/it]

Title: ANLS* -- A Universal Document Processing Metric for Generative Large Language Models
Evaluation Result: Title: ANLS* -- A Universal Document Processing Metric for Generative Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, evaluating GLLMs presents a challenge as the binary true or false evaluation used for discriminative models is not applicable to the predictions made by GLLMs."

Progress saved at index: 283


Processing papers:  56%|█████████████████████████████████▉                           | 284/511 [14:07<10:09,  2.68s/it]

Title: BiLLM: Pushing the Limit of Post-Training Quantization for LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Pretrained large language models (LLMs) exhibit exceptional general language processing capabilities but come with significant demands on memory and computational resources." and "existing quantization techniques fall short of maintaining LLM performance under ultra-low bit-widths."

Progress saved at index: 284


Processing papers:  56%|██████████████████████████████████                           | 285/511 [14:10<09:48,  2.60s/it]

Title: Rethinking Skill Extraction in the Job Market Domain using Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the reliance on manually annotated data limits the generalizability of such approaches. Moreover, the common BIO setting limits the ability of the models to capture complex skill patterns and handle ambiguous mentions."

Progress saved at index: 285


Processing papers:  56%|██████████████████████████████████▏                          | 286/511 [14:12<09:49,  2.62s/it]

Title: RevOrder: A Novel Method for Enhanced Arithmetic in Language Models
Evaluation Result: Title: RevOrder: A Novel Method for Enhanced Arithmetic in Language Models
Paper: This paper presents RevOrder, a novel technique aimed at improving arithmetic operations in large language models (LLMs) by reversing the output digits in addition, subtraction, and n-digit by 1-digit (nD by 1D) multiplication tasks. Our method significantly reduces the Count of Sequential Intermediate Digits (CSID) to $\mathcal{O}(1)$, a new metric we introduce to assess equation

Progress saved at index: 286


Processing papers:  56%|██████████████████████████████████▎                          | 287/511 [14:15<10:05,  2.70s/it]

Title: ReLU$^2$ Wins: Discovering Efficient Activation Functions for Sparse LLMs
Evaluation Result: Title: ReLU$^2$ Wins: Discovering Efficient Activation Functions for Sparse LLMs

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Sparse computation offers a compelling solution for the inference of Large Language Models (LLMs) in low-resource scenarios by dynamically skipping the computation of inactive neurons."

Progress saved at index: 287


Processing papers:  56%|██████████████████████████████████▍                          | 288/511 [14:18<10:10,  2.74s/it]

Title: Exploring Low-Resource Medical Image Classification with Weakly Supervised Prompt Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "existing pre-trained vision-language models require domain experts to carefully design the medical prompts, which greatly increases the burden on clinicians."

Progress saved at index: 288


Processing papers:  57%|██████████████████████████████████▍                          | 289/511 [14:20<09:24,  2.54s/it]

Title: MolTC: Towards Molecular Relational Modeling In Language Models
Evaluation Result: **Title: MolTC: Towards Molecular Relational Modeling In Language Models**

Does it talk about LLMs: Yes.

Rate Limitations of LLMs: 3.

Evidence: "Despite their potential, these methods predominantly rely on the textual data, thus not fully harnessing the wealth of structural information inherent in molecular graphs."

Progress saved at index: 289


Processing papers:  57%|██████████████████████████████████▌                          | 290/511 [14:23<09:18,  2.53s/it]

Title: Large Language Models As MOOCs Graders
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the History and Philosophy of Astronomy course proves to be more challenging in terms of grading as opposed to other courses."

Progress saved at index: 290


Processing papers:  57%|██████████████████████████████████▋                          | 291/511 [14:25<09:28,  2.58s/it]

Title: The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "those powerful MLLMs such as GPT-4V still fail spectacularly when presented with certain image and text inputs." and "illustrating that they universally suffer from this instinctive bias to varying degrees."

Progress saved at index: 291


Processing papers:  57%|██████████████████████████████████▊                          | 292/511 [14:28<09:09,  2.51s/it]

Title: INSIDE: LLMs' Internal States Retain the Power of Hallucination Detection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Knowledge hallucination have raised widespread concerns for the security and reliability of deployed LLMs."

Progress saved at index: 292


Processing papers:  57%|██████████████████████████████████▉                          | 293/511 [14:30<08:32,  2.35s/it]

Title: Similarity-based Neighbor Selection for Graph LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Prior research in this field has grappled with issues such as over-squashing, heterophily, and ineffective graph information integration, further compounded by inconsistencies in dataset partitioning and underutilization of advanced LLMs."

Progress saved at index: 293


Processing papers:  58%|███████████████████████████████████                          | 294/511 [14:32<08:26,  2.33s/it]

Title: Automatic Robotic Development through Collaborative Framework by Large Language Models
Evaluation Result: Title: Automatic Robotic Development through Collaborative Framework by Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Despite the remarkable code generation abilities of large language models LLMs, they still face challenges in complex task handling."

Progress saved at index: 294


Processing papers:  58%|███████████████████████████████████▏                         | 295/511 [14:34<08:12,  2.28s/it]

Title: Personalized Language Modeling from Personalized Human Feedback
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the underlying premise of algorithms developed under this framework can be problematic when user preferences encoded in human feedback are diverse." and "explain why vanilla RLHF can be problematic in this context."

Progress saved at index: 295


Processing papers:  58%|███████████████████████████████████▎                         | 296/511 [14:36<08:02,  2.25s/it]

Title: Large Language Models as an Indirect Reasoner: Contrapositive and Contradiction for Automated Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "previous methods, such as Chain-of-Thought and Self-Consistency, mainly follow Direct Reasoning (DR) frameworks, so they will meet difficulty in solving numerous real-world tasks which can hardly be solved via DR."

Progress saved at index: 296


Processing papers:  58%|███████████████████████████████████▍                         | 297/511 [14:39<08:05,  2.27s/it]

Title: Limits of Large Language Models in Debating Humans
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We find that LLMs can blend in and facilitate human productivity but are less convincing in debate, with their behavior ultimately deviating from human's. We elucidate these primary failings and anticipate that LLMs must evolve further before being viable debaters."

Progress saved at index: 297


Processing papers:  58%|███████████████████████████████████▌                         | 298/511 [14:42<09:22,  2.64s/it]

Title: Learning to Generate Explainable Stock Predictions using Self-Reflective Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the task of stock prediction remains challenging for LLMs, as it requires the ability to weigh the varying impacts of chaotic social texts on stock prices. The problem gets progressively harder with the introduction of the explanation component, which requires LLMs to explain verbally why certain factors are more important than the others."

Progress saved at index: 298


Processing papers:  59%|███████████████████████████████████▋                         | 299/511 [14:46<10:25,  2.95s/it]

Title: Sentiment-enhanced Graph-based Sarcasm Explanation in Dialogue
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Although existing studies have achieved great success based on the generative pretrained language model BART, they overlook exploiting the sentiments residing in the utterance, video and audio, which are vital clues for sarcasm explanation."

Progress saved at index: 299


Processing papers:  59%|███████████████████████████████████▊                         | 300/511 [14:48<09:58,  2.84s/it]

Title: Enhancing LLM-Based Coding Tools through Native Integration of IDE-Derived Static Context
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Being trained on in-file contexts, current LLMs are quite effective in completing code for single source files. However, it is challenging for them to conduct repository-level code completion for large software projects that require cross-file information. Existing research on LLM-based repository-level code completion identifies and integrates cross-file contexts, but it suffers from low accuracy and limited context length of LLMs

Progress saved at index: 300


Processing papers:  59%|███████████████████████████████████▉                         | 301/511 [14:51<09:42,  2.77s/it]

Title: Partially Recentralization Softmax Loss for Vision-Language Models Robustness
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "it has been shown that multimodal NLP are vulnerable to adversarial attacks, where the outputs of a model can be dramatically changed by a perturbation to the input."

Progress saved at index: 301


Processing papers:  59%|████████████████████████████████████                         | 302/511 [14:54<09:26,  2.71s/it]

Title: Self-Discover: Large Language Models Self-Compose Reasoning Structures
Evaluation Result: Title: Self-Discover: Large Language Models Self-Compose Reasoning Structures
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the task-intrinsic reasoning structures to tackle complex reasoning problems that are challenging for typical prompting methods."

Progress saved at index: 302


Processing papers:  59%|████████████████████████████████████▏                        | 303/511 [14:56<09:30,  2.74s/it]

Title: Improving Contextual Congruence Across Modalities for Effective Multimodal Marketing using Knowledge-infused Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "large Language (LLMs) and Vision models (LVMs) are still limited in capturing holistic meaning with cross-modal semantic relationships."

Progress saved at index: 303


Processing papers:  59%|████████████████████████████████████▎                        | 304/511 [14:59<09:06,  2.64s/it]

Title: Distinguishing the Knowable from the Unknowable with Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "We study the feasibility of identifying epistemic uncertainty (reflecting a lack of knowledge), as opposed to aleatoric uncertainty (reflecting entropy in the underlying distribution), in the outputs of large language models (LLMs) over free-form text."

Progress saved at index: 304


Processing papers:  60%|████████████████████████████████████▍                        | 305/511 [15:02<09:24,  2.74s/it]

Title: Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains
Evaluation Result: Title: Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We acquire annotations from domain experts to identify inconsistencies in summaries and systematically categorize these errors."

Progress saved at index: 305


Processing papers:  60%|████████████████████████████████████▌                        | 306/511 [15:04<08:59,  2.63s/it]

Title: Neural networks for abstraction and reasoning: Towards broad generalization in machines
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLMs are able to solve a different group of problems to state-of-the-art solvers, and provide an interesting way to complement other approaches."

Progress saved at index: 306


Processing papers:  60%|████████████████████████████████████▋                        | 307/511 [15:06<08:35,  2.53s/it]

Title: Beyond Text: Improving LLM's Decision Making for Robot Navigation via Vocal Cues
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "This work highlights a critical shortcoming in text-based Large Language Models (LLMs) used for human-robot interaction, demonstrating that text alone as a conversation modality falls short in such applications. While LLMs excel in processing text in these human conversations, they struggle with the nuances of verbal instructions in scenarios like social navigation, where ambiguity and uncertainty can erode trust in robotic and

Progress saved at index: 307


Processing papers:  60%|████████████████████████████████████▊                        | 308/511 [15:10<09:38,  2.85s/it]

Title: A Systematic Survey of Prompt Engineering in Large Language Models: Techniques and Applications
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "We also delve into the strengths and limitations of each approach and include a taxonomy diagram and table summarizing datasets, models, and critical points of each prompting technique."

Progress saved at index: 308


Processing papers:  60%|████████████████████████████████████▉                        | 309/511 [15:28<24:42,  7.34s/it]

Title: Arabic Synonym BERT-based Adversarial Examples for Text Classification
Evaluation Result: Title: Arabic Synonym BERT-based Adversarial Examples for Text Classification
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "We find that fine-tuned BERT models were more susceptible to our synonym attacks than the other Deep Neural Networks (DNN) models like WordCNN and WordLSTM we trained."

Progress saved at index: 309


Processing papers:  61%|█████████████████████████████████████                        | 310/511 [15:30<19:46,  5.90s/it]

Title: Nevermind: Instruction Override and Moderation in Large Language Models
Evaluation Result: Title: Nevermind: Instruction Override and Moderation in Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Finally, we observe improving instruction following, and subsequently instruction overrides/jailbreaks, is fundamentally at odds with the ability of a language model to follow given safety filters or guidelines."

Progress saved at index: 310


Processing papers:  61%|█████████████████████████████████████▏                       | 311/511 [15:33<16:43,  5.02s/it]

Title: DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
Evaluation Result: Title: DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature."

Progress saved at index: 311


Processing papers:  61%|█████████████████████████████████████▏                       | 312/511 [15:36<14:19,  4.32s/it]

Title: GUARD: Role-playing to Generate Natural-language Jailbreakings to Test Guideline Adherence of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "The discovery of 'jailbreaks' to bypass safety filters of Large Language Models (LLMs) and harmful responses have encouraged the community to implement safety measures." and "Our system of different roles will leverage this knowledge graph to generate new jailbreaks, which have proved effective in inducing LLMs to generate unethical or guideline-violating responses."

Progress saved at index: 312


Processing papers:  61%|█████████████████████████████████████▎                       | 313/511 [15:39<12:54,  3.91s/it]

Title: Make Every Move Count: LLM-based High-Quality RTL Code Generation Using MCTS
Evaluation Result: Title: Make Every Move Count: LLM-based High-Quality RTL Code Generation Using MCTS

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Existing large language models (LLMs) for register transfer level code generation face challenges like compilation failures and suboptimal power, performance, and area (PPA) efficiency."

Progress saved at index: 313


Processing papers:  61%|█████████████████████████████████████▍                       | 314/511 [15:42<11:32,  3.51s/it]

Title: Deal, or no deal (or who knows)? Forecasting Uncertainty in Conversations using Large Language Models
Evaluation Result: Title: Deal, or no deal (or who knows)? Forecasting Uncertainty in Conversations using Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "How well can language models represent inherent uncertainty in conversations?" and "Experiments on eight difficult negotiation corpora demonstrate that our proposed fine-tuning strategies... can calibrate smaller open-source models to compete with pre-trained models 10x their size."

Progress saved at index: 314


Processing papers:  62%|█████████████████████████████████████▌                       | 315/511 [15:45<10:59,  3.37s/it]

Title: A Framework for Partially Observed Reward-States in RLHF
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Unfortunately current models of RLHF do not take take this into consideration. Moreover most RLHF models do not account for intermediate feedback, which is gaining importance in empirical work and can help improve both sample complexity and alignment."

Progress saved at index: 315


Processing papers:  62%|█████████████████████████████████████▋                       | 316/511 [15:47<10:11,  3.13s/it]

Title: MobilityGPT: Enhanced Human Mobility Modeling with a GPT model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Generative models have shown promising results in capturing human mobility characteristics and generating synthetic trajectories. However, it remains challenging to ensure that the generated geospatial mobility data is semantically realistic, including consistent location sequences, and reflects real-world characteristics, such as constraining on geospatial limits."

Progress saved at index: 316


Processing papers:  62%|█████████████████████████████████████▊                       | 317/511 [15:50<09:49,  3.04s/it]

Title: English Prompts are Better for NLI-based Zero-Shot Emotion Classification than Target-Language Prompts
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Our experiments with natural language inference-based language models show that it is consistently better to use English prompts even if the data is in a different language."

Progress saved at index: 317


Processing papers:  62%|█████████████████████████████████████▉                       | 318/511 [15:52<08:52,  2.76s/it]

Title: Unified Hallucination Detection for Multimodal Large Language Models
Evaluation Result: Title: Unified Hallucination Detection for Multimodal Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Multimodal Large Language Models (MLLMs) are plagued by the critical issue of hallucination."

Progress saved at index: 318


Processing papers:  62%|██████████████████████████████████████                       | 319/511 [15:54<08:23,  2.62s/it]

Title: LB-KBQA: Large-language-model and BERT based Knowledge-Based Question and Answering System
Evaluation Result: Title: LB-KBQA: Large-language-model and BERT based Knowledge-Based Question and Answering System
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "One of the typical application fields of Generative AI is large language models (LLMs), and the natural language understanding capability of LLM is dramatically improved when compared with conventional AI-based methods."

Progress saved at index: 319


Processing papers:  63%|██████████████████████████████████████▏                      | 320/511 [15:58<09:06,  2.86s/it]

Title: Empowering Time Series Analysis with Large Language Models: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "completely training a large general-purpose model from the scratch is challenging for time series analysis, due to the large volumes and varieties of time series data, as well as the non-stationarity that leads to concept drift impeding continuous model adaptation and re-training."

Progress saved at index: 320


Processing papers:  63%|██████████████████████████████████████▎                      | 321/511 [16:00<08:43,  2.76s/it]

Title: C-RAG: Certified Generation Risks for Retrieval-Augmented Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Despite the impressive capabilities of large language models (LLMs) across diverse applications, they still suffer from trustworthiness issues, such as hallucinations and misalignments."

Progress saved at index: 321


Processing papers:  63%|██████████████████████████████████████▍                      | 322/511 [16:03<08:25,  2.67s/it]

Title: CIDAR: Culturally Relevant Instruction Dataset For Arabic
Evaluation Result: Title: CIDAR: Culturally Relevant Instruction Dataset For Arabic
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "current instruction datasets predominantly cater to English or are derived from English-dominated LLMs, resulting in inherent biases toward Western culture."

Progress saved at index: 322


Processing papers:  63%|██████████████████████████████████████▌                      | 323/511 [16:05<08:04,  2.58s/it]

Title: The Matrix: A Bayesian learning model for LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We explore the optimization metric of LLMs, which is based on predicting the next token, and develop a novel model grounded in this principle."

Progress saved at index: 323


Processing papers:  63%|██████████████████████████████████████▋                      | 324/511 [16:07<07:39,  2.46s/it]

Title: MULTI: Multimodal Understanding Leaderboard with Text and Images
Evaluation Result: Title: MULTI: Multimodal Understanding Leaderboard with Text and Images

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "existing benchmarks primarily focus on understanding simple natural images and short context" and "Our evaluation indicates significant potential for MLLM advancement, with GPT-4V achieving a 63.7% accuracy rate on MULTI, in contrast to other MLLMs scoring between 28.5% and

Progress saved at index: 324


Processing papers:  64%|██████████████████████████████████████▊                      | 325/511 [16:10<07:53,  2.55s/it]

Title: Homograph Attacks on Maghreb Sentiment Analyzers
Evaluation Result: Title: Homograph Attacks on Maghreb Sentiment Analyzers
Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention language models (LLMs) explicitly.

Progress saved at index: 325


Processing papers:  64%|██████████████████████████████████████▉                      | 326/511 [16:12<07:38,  2.48s/it]

Title: Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization
Evaluation Result: Title: Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "In this paper, we address such limitations in video-language pre-training with an efficient video decomposition that represents each video as keyframes and temporal motions."

Progress saved at index: 326


Processing papers:  64%|███████████████████████████████████████                      | 327/511 [16:15<07:46,  2.53s/it]

Title: Constrained Decoding for Cross-lingual Label Projection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, for NLP tasks that involve fine-grained predictions on words and phrases, the performance of zero-shot cross-lingual transfer learning lags far behind supervised fine-tuning methods."

Progress saved at index: 327


Processing papers:  64%|███████████████████████████████████████▏                     | 328/511 [16:18<07:54,  2.59s/it]

Title: Evaluation of ChatGPT Usability as A Code Generation Tool
Evaluation Result: Title: Evaluation of ChatGPT Usability as A Code Generation Tool
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Our experiments demonstrated that ChatGPT is highly useful for generating R program code although it may fail on hard programming tasks." and "Our experiment also shows that it is hard for human developers to learn from experiences to improve the skill of using ChatGPT to generate code."

Progress saved at index: 328


Processing papers:  64%|███████████████████████████████████████▎                     | 329/511 [16:20<07:50,  2.58s/it]

Title: Best Practices for Text Annotation with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Researchers have warned that the ostensible simplicity of LLMs can be misleading, as they are prone to bias, misunderstandings, and unreliable results."

Progress saved at index: 329


Processing papers:  65%|███████████████████████████████████████▍                     | 330/511 [16:23<07:23,  2.45s/it]

Title: Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Prompt engineering is a challenging and important task due to the high sensitivity of Large Language Models (LLMs) to the given prompt and the inherent ambiguity of a textual task instruction."

Progress saved at index: 330


Processing papers:  65%|███████████████████████████████████████▌                     | 331/511 [16:25<07:17,  2.43s/it]

Title: Enhancing the Stability of LLM-based Speech Generation Systems through Self-Supervised Representations
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Nevertheless, they suffer from multiple stability issues at inference time, such as hallucinations, content skipping or speech repetitions."

Progress saved at index: 331


Processing papers:  65%|███████████████████████████████████████▋                     | 332/511 [16:27<07:02,  2.36s/it]

Title: UniMem: Towards a Unified View of Long-Context Large Language Models
Evaluation Result: Title: UniMem: Towards a Unified View of Long-Context Large Language Models
Paper: Long-context processing is a critical ability that constrains the applicability of large language models. Although there exist various methods devoted to enhancing the long-context processing ability of large language models (LLMs), they are developed in an isolated manner and lack systematic analysis and integration of their strengths, hindering further developments. In this paper, we introduce UniMem, a unified framework that reformulates existing long-context methods from the

Progress saved at index: 332


Processing papers:  65%|███████████████████████████████████████▊                     | 333/511 [16:31<07:51,  2.65s/it]

Title: Conversation Reconstruction Attack Against GPT Models
Evaluation Result: **Title: Conversation Reconstruction Attack Against GPT Models**

- **Does it talk about LLMs:** Yes.
- **Rate Limitations of LLMs:** 5.
- **Evidence:** "Our results reveal the concern about privacy risks associated with conversations involving GPT models and aim to draw the community's attention to prevent the potential misuse of these models' remarkable capabilities."

Progress saved at index: 333


Processing papers:  65%|███████████████████████████████████████▊                     | 334/511 [16:34<08:16,  2.81s/it]

Title: Unsupervised semantic segmentation of high-resolution UAV imagery for road scene parsing
Evaluation Result: Title: Unsupervised semantic segmentation of high-resolution UAV imagery for road scene parsing

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "a vision language model is employed to efficiently process ultra-large resolution UAV images to quickly detect road regions of interest in the images."

Progress saved at index: 334


Processing papers:  66%|███████████████████████████████████████▉                     | 335/511 [16:36<08:12,  2.80s/it]

Title: Enhancing Textbook Question Answering Task with Large Language Models and Retrieval Augmented Generation
Evaluation Result: Title: Enhancing Textbook Question Answering Task with Large Language Models and Retrieval Augmented Generation

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "there are still some limitations including the models' weak reasoning and inability to capture contextual information in the lengthy context," and "directly applying LLMs often leads to inaccurate answers."

Progress saved at index: 335


Processing papers:  66%|████████████████████████████████████████                     | 336/511 [16:39<08:08,  2.79s/it]

Title: LLM Agents in Interaction: Measuring Personality Consistency and Linguistic Alignment in Interacting Populations of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "highlights the need for new approaches to crafting robust, more human-like LLM personas for interactive environments."

Progress saved at index: 336


Processing papers:  66%|████████████████████████████████████████▏                    | 337/511 [16:41<07:24,  2.56s/it]

Title: Shortened LLaMA: A Simple Depth Pruning for Large Language Models
Evaluation Result: Title: Shortened LLaMA: A Simple Depth Pruning for Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Structured pruning of modern large language models (LLMs) has emerged as a way of decreasing their high computational needs."

Progress saved at index: 337


Processing papers:  66%|████████████████████████████████████████▎                    | 338/511 [16:44<07:07,  2.47s/it]

Title: Evading Data Contamination Detection for Language Models is (too) Easy
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, the vast amount of data these models are trained on can inadvertently lead to contamination with public benchmarks, thus compromising performance measurements." and "we propose a categorization of both model providers and contamination detection methods. This reveals vulnerabilities in existing methods that we exploit with EAL, a simple yet effective contamination technique that significantly inflates benchmark performance while completely evading current detection methods."

Progress saved at index: 338


Processing papers:  66%|████████████████████████████████████████▍                    | 339/511 [16:46<07:15,  2.53s/it]

Title: Graph-enhanced Large Language Models in Asynchronous Plan Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We find that a representative set of closed and open-source LLMs, including GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations about the task-solving process in our benchmark AsyncHow." and "LLMs still suffer from drastic degradation when task complexity increases, highlighting the limits of utilizing LLMs for simulating digital devices."

Progress saved at index: 339


Processing papers:  67%|████████████████████████████████████████▌                    | 340/511 [16:49<07:53,  2.77s/it]

Title: Large Language Model Distilling Medication Recommendation Model
Evaluation Result: **Title: Large Language Model Distilling Medication Recommendation Model**

**Does it talk about LLMs:** Yes.  
**Rate Limitations of LLMs:** 3.  
**Evidence:** "However, the straightforward integration of LLMs into recommender systems leads to an out-of-corpus issue specific to drugs." and "LLM-based models exhibit remarkable capabilities, they are plagued by high computational costs during inference, which is impractical for the healthcare sector."

Progress saved at index: 340


Processing papers:  67%|████████████████████████████████████████▋                    | 341/511 [16:53<08:08,  2.87s/it]

Title: KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language Models
Evaluation Result: Title: KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: The abstract discusses a method for fine-tuning LLMs and finding effective subsets of parameters but does not mention any explicit limitations of the models.

Progress saved at index: 341


Processing papers:  67%|████████████████████████████████████████▊                    | 342/511 [16:56<08:50,  3.14s/it]

Title: Rethinking Optimization and Architecture for Tiny Language Models
Evaluation Result: Title: Rethinking Optimization and Architecture for Tiny Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, the application of language models on mobile devices is facing huge challenge on the computation and memory costs, that is, tiny language models with high performance are urgently required."

Progress saved at index: 342


Processing papers:  67%|████████████████████████████████████████▉                    | 343/511 [16:59<08:19,  2.97s/it]

Title: List-aware Reranking-Truncation Joint Model for Search and Retrieval-augmented Generation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "retrieval-augmented generation for large language models (LLMs)."

Progress saved at index: 343


Processing papers:  67%|█████████████████████████████████████████                    | 344/511 [17:01<07:22,  2.65s/it]

Title: DeAL: Decoding-time Alignment for Large Language Models
Evaluation Result: Title: DeAL: Decoding-time Alignment for Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "First, the inability to incorporate multiple, custom rewards and reliance on a model developer's view of universal and static principles are key limitations. Second, the residual gaps in model training and the reliability of such approaches are also questionable (e.g. susceptibility to jail-breaking even after safety training)."

Progress saved at index: 344


Processing papers:  68%|█████████████████████████████████████████▏                   | 345/511 [17:04<07:32,  2.72s/it]

Title: Illuminate: A novel approach for depression detection with explainable analysis and proactive therapy using prompt engineering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "These LLMs are fine-tuned with specialized prompts to diagnose, explain, and suggest therapeutic interventions for depression."

Progress saved at index: 345


Processing papers:  68%|█████████████████████████████████████████▎                   | 346/511 [17:06<06:58,  2.54s/it]

Title: KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "the key-value (KV) cache, which stores attention keys and values to avoid re-computations, significantly increases memory demands and becomes the new bottleneck in speed and memory usage."

Progress saved at index: 346


Processing papers:  68%|█████████████████████████████████████████▍                   | 347/511 [17:08<06:40,  2.45s/it]

Title: Understanding the planning of LLM agents: A survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "further challenges for the field of research are discussed."

Progress saved at index: 347


Processing papers:  68%|█████████████████████████████████████████▌                   | 348/511 [17:10<06:20,  2.34s/it]

Title: Adversarial Text Purification: A Large Language Model Approach for Defense
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We propose a novel adversarial text purification that harnesses the generative capabilities of Large Language Models (LLMs) to purify adversarial text without the need to explicitly characterize the discrete noise perturbations."

Progress saved at index: 348


Processing papers:  68%|█████████████████████████████████████████▋                   | 349/511 [17:13<06:31,  2.41s/it]

Title: Large Language Models are Geographically Biased
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models (LLMs) inherently carry the biases contained in their training corpora, which can lead to the perpetuation of societal harm." and "We show various problematic geographic biases, which we define as systemic errors in geospatial predictions." and "LLMs exhibit common biases across a range of objective and subjective topics" and "LLMs are clearly biased against locations with lower

Progress saved at index: 349


Processing papers:  68%|█████████████████████████████████████████▊                   | 350/511 [17:16<07:23,  2.76s/it]

Title: Open-Universe Indoor Scene Generation using LLM Program Synthesis and Uncurated Object Databases
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Instead, it leverages the world knowledge encoded in pre-trained large language models (LLMs) to synthesize programs in a domain-specific layout language that describe objects and spatial relations between them."

Progress saved at index: 350


Processing papers:  69%|█████████████████████████████████████████▉                   | 351/511 [17:19<07:00,  2.63s/it]

Title: RACER: An LLM-powered Methodology for Scalable Analysis of Semi-structured Mental Health Interviews
Evaluation Result: Title: RACER: An LLM-powered Methodology for Scalable Analysis of Semi-structured Mental Health Interviews

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Interestingly, LLMs and humans struggle with similar content involving nuanced emotional, ambivalent/dialectical, and psychological statements. Our study highlights the opportunities and challenges in using LLMs to improve research efficiency."

Progress saved at index: 351


Processing papers:  69%|██████████████████████████████████████████                   | 352/511 [17:21<06:55,  2.61s/it]

Title: Recursive Chain-of-Feedback Prevents Performance Degradation from Redundant Prompting
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models (LLMs) frequently struggle with complex reasoning tasks, failing to construct logically sound steps towards the solution." and "repeated meaningless feedback gradually decreases the quality of the responses, eventually leading to a larger deviation from the intended outcome."

Progress saved at index: 352


Processing papers:  69%|██████████████████████████████████████████▏                  | 353/511 [17:24<06:59,  2.66s/it]

Title: Zero-Shot Clinical Trial Patient Matching with LLMs
Evaluation Result: Title: Zero-Shot Clinical Trial Patient Matching with LLMs
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Large language models (LLMs) offer a promising solution."

Progress saved at index: 353


Processing papers:  69%|██████████████████████████████████████████▎                  | 354/511 [17:26<06:38,  2.54s/it]

Title: LLM-Enhanced Data Management
Evaluation Result: Title: LLM-Enhanced Data Management  
Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 5.  
Evidence: "existing LLMs have several limitations: hallucination, high cost, and low accuracy for complicated tasks."

Progress saved at index: 354


Processing papers:  69%|██████████████████████████████████████████▍                  | 355/511 [17:29<06:25,  2.47s/it]

Title: Can Large Language Models Learn Independent Causal Mechanisms?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models (LLMs) fall short on the same tasks in uncommon settings or with distribution shifts, exhibiting some lack of generalisation ability."

Progress saved at index: 355


Processing papers:  70%|██████████████████████████████████████████▍                  | 356/511 [17:31<06:09,  2.38s/it]

Title: Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity
Evaluation Result: Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 3.  
Evidence: "Fine-tuning and testing a multilingual large language model is expensive and challenging for low-resource languages (LRLs)."

Progress saved at index: 356


Processing papers:  70%|██████████████████████████████████████████▌                  | 357/511 [17:33<06:14,  2.43s/it]

Title: UniTSyn: A Large-Scale Dataset Capable of Enhancing the Prowess of Large Language Models for Program Testing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "existing code LLMs often demonstrate unsatisfactory capabilities in generating accurate and complete tests since they were trained on code snippets collected without differentiating between code for testing purposes and other code."

Progress saved at index: 357


Processing papers:  70%|██████████████████████████████████████████▋                  | 358/511 [17:36<06:10,  2.42s/it]

Title: PuzzleBench: Can LLMs Solve Challenging First-Order Combinatorial Reasoning Problems?
Evaluation Result: Title: PuzzleBench: Can LLMs Solve Challenging First-Order Combinatorial Reasoning Problems?

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We first observe that LLMs, even when aided by symbolic solvers, perform rather poorly on our dataset."

Progress saved at index: 358


Processing papers:  70%|██████████████████████████████████████████▊                  | 359/511 [17:39<06:45,  2.67s/it]

Title: A Truly Joint Neural Architecture for Segmentation and Parsing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This proposed architecture is LLM-based and language agnostic, providing a solid foundation for MRLs to obtain further performance improvements and bridge the gap with other languages."

Progress saved at index: 359


Processing papers:  70%|██████████████████████████████████████████▉                  | 360/511 [17:41<06:26,  2.56s/it]

Title: DefInt: A Default-interventionist Framework for Efficient Reasoning with Hybrid Large Language Models
Evaluation Result: Title: DefInt: A Default-interventionist Framework for Efficient Reasoning with Hybrid Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large language models (LLMs) have shown impressive emergent abilities in a wide range of tasks, but still face challenges in handling complex reasoning problems." and "Previous works like chain-of-thought (CoT) and tree-of-thoughts (ToT) have predomin

Progress saved at index: 360


Processing papers:  71%|███████████████████████████████████████████                  | 361/511 [17:44<06:28,  2.59s/it]

Title: Enhancing Robustness in Biomedical NLI Models: A Probing Approach for Clinical Trials
Evaluation Result: Title: Enhancing Robustness in Biomedical NLI Models: A Probing Approach for Clinical Trials
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Large Language Models are susceptible to shortcut learning, factual inconsistency, and performance degradation with little variation in context."

Progress saved at index: 361


Processing papers:  71%|███████████████████████████████████████████▏                 | 362/511 [17:47<06:34,  2.65s/it]

Title: Are Large Language Models Table-based Fact-Checkers?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Existing TFV methods based on small-scaled models suffer from insufficient labeled data and weak zero-shot ability." and "We also make some valuable findings about the format of zero-shot prompts and the number of in-context examples."

Progress saved at index: 362


Processing papers:  71%|███████████████████████████████████████████▎                 | 363/511 [17:49<06:28,  2.62s/it]

Title: Knowledge Generation for Zero-shot Knowledge-based VQA
Evaluation Result: Title: Knowledge Generation for Zero-shot Knowledge-based VQA

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, these recent methods do not explicitly show the knowledge needed to answer the questions and thus lack interpretability."

Progress saved at index: 363


Processing papers:  71%|███████████████████████████████████████████▍                 | 364/511 [17:52<06:16,  2.56s/it]

Title: CompeteSMoE -- Effective Training of Sparse Mixture of Experts via Competition
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this work, we propose a competition mechanism to address this fundamental challenge of representation collapse."

Progress saved at index: 364


Processing papers:  71%|███████████████████████████████████████████▌                 | 365/511 [17:54<05:48,  2.39s/it]

Title: Conversational Crowdsensing: A Parallel Intelligence Powered Novel Sensing Approach
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "new requirements and opportunities to current sensing approaches, especially in light of recent progress in Chatbots and Large Language Models (LLMs)."

Progress saved at index: 365


Processing papers:  72%|███████████████████████████████████████████▋                 | 366/511 [17:56<05:59,  2.48s/it]

Title: GeReA: Question-Aware Prompt Captions for Knowledge-based Visual Question Answering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, such conversion may introduce irrelevant information, which causes the LLM to misinterpret images and ignore visual details crucial for accurate knowledge."

Progress saved at index: 366


Processing papers:  72%|███████████████████████████████████████████▊                 | 367/511 [17:58<05:39,  2.36s/it]

Title: Navigating the Peril of Generated Alternative Facts: A ChatGPT-4 Fabricated Omega Variant Case as a Cautionary Tale in Medical Misinformation
Evaluation Result: Title: Navigating the Peril of Generated Alternative Facts: A ChatGPT-4 Fabricated Omega Variant Case as a Cautionary Tale in Medical Misinformation

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "The ease with which AI can generate believable but false scientific information, as illustrated in this case, raises significant concerns about the potential for misinformation in medicine."

Progress saved at index: 367


Processing papers:  72%|███████████████████████████████████████████▉                 | 368/511 [18:02<06:35,  2.77s/it]

Title: BRAIn: Bayesian Reward-conditioned Amortized Inference for natural language generation from feedback
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, they leave behind important features of the PPO approach. Methods such as SLiC or RRHF make use of the Reward Model (RM) only for ranking/preference, losing fine-grained information and ignoring the parametric form of the RM (eg., Bradley-Terry, Plackett-Luce), while methods such as DPO do not use even a separate reward

Progress saved at index: 368


Processing papers:  72%|████████████████████████████████████████████                 | 369/511 [18:05<06:27,  2.73s/it]

Title: A Graph is Worth $K$ Words: Euclideanizing Graph using Pure Transformer
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention LLMs or their limitations.

Progress saved at index: 369


Processing papers:  72%|████████████████████████████████████████████▏                | 370/511 [18:07<05:57,  2.53s/it]

Title: FoldToken: Learning Protein Language via Vector Quantization and Beyond
Evaluation Result: Does it talk about LLMs: No.
Rate Limitations of LLMs: 1.
Evidence: The abstract does not mention language models or their limitations.

Progress saved at index: 370


Processing papers:  73%|████████████████████████████████████████████▎                | 371/511 [18:09<05:34,  2.39s/it]

Title: Breaking MLPerf Training: A Case Study on Optimizing BERT
Evaluation Result: Title: Breaking MLPerf Training: A Case Study on Optimizing BERT
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We present novel approaches for fast large-scale training of BERT model which individually ameliorates each component thereby leading to a new level of BERT training performance."

Progress saved at index: 371


Processing papers:  73%|████████████████████████████████████████████▍                | 372/511 [18:11<05:33,  2.40s/it]

Title: LQER: Low-Rank Quantization Error Reconstruction for LLMs
Evaluation Result: Title: LQER: Low-Rank Quantization Error Reconstruction for LLMs

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Post-training quantization of Large Language Models (LLMs) is challenging."

Progress saved at index: 372


Processing papers:  73%|████████████████████████████████████████████▌                | 373/511 [18:14<05:43,  2.49s/it]

Title: Factuality of Large Language Models in the Year 2024
Evaluation Result: Title: Factuality of Large Language Models in the Year 2024
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Unfortunately, in many cases, LLM responses are factually incorrect, which limits their applicability in real-world scenarios."

Progress saved at index: 373


Processing papers:  73%|████████████████████████████████████████████▋                | 374/511 [18:16<05:35,  2.45s/it]

Title: Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "RLHF encounters major challenges including training reward models, actor-critic engineering, and importantly, it requires access to LLM parameters."

Progress saved at index: 374


Processing papers:  73%|████████████████████████████████████████████▊                | 375/511 [18:19<05:24,  2.39s/it]

Title: GLaPE: Gold Label-agnostic Prompt Evaluation and Optimization for Large Language Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Despite the rapid progress of large language models (LLMs), their task performance remains sensitive to prompt design."

Progress saved at index: 375


Processing papers:  74%|████████████████████████████████████████████▉                | 376/511 [18:21<05:20,  2.37s/it]

Title: DeLLMa: A Framework for Decision Making Under Uncertainty with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "we show that directly prompting LLMs on these types of decision-making problems yields poor results, especially as the problem complexity increases."

Progress saved at index: 376


Processing papers:  74%|█████████████████████████████████████████████                | 377/511 [18:23<05:11,  2.32s/it]

Title: KICGPT: Large Language Model with Knowledge in Context for Knowledge Graph Completion
Evaluation Result: Title: KICGPT: Large Language Model with Knowledge in Context for Knowledge Graph Completion
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Text-based methods alleviate this issue but require costly training for language models and specific finetuning for knowledge graphs, which limits their efficiency."

Progress saved at index: 377


Processing papers:  74%|█████████████████████████████████████████████                | 378/511 [18:27<06:00,  2.71s/it]

Title: Solution-oriented Agent-based Models Generation with Verifier-assisted Iterative In-context Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, LLMs excel in handling sequential information, making it challenging for analyzing the intricate interactions and nonlinear dynamics inherent in ABMs. Additionally, due to the lack of self-evaluation capability of LLMs, relying solely on LLMs is insufficient to effectively accomplish this process."

Progress saved at index: 378


Processing papers:  74%|█████████████████████████████████████████████▏               | 379/511 [18:30<06:09,  2.80s/it]

Title: Evaluating Large Language Models in Analysing Classroom Dialogue
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Results indicate substantial time savings with GPT-4, and a high degree of consistency in coding between the model and human coders, with some discrepancies in specific codes."

Progress saved at index: 379


Processing papers:  74%|█████████████████████████████████████████████▎               | 380/511 [18:32<05:44,  2.63s/it]

Title: AutoTimes: Autoregressive Time Series Forecasters via Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "increasing research demonstrates the feasibility of leveraging large language models (LLM) for time series" and "we propose AutoTimes to repurpose LLMs as Autoregressive Time series forecasters, which is consistent with the acquisition and utilization of LLMs without updating the parameters."

Progress saved at index: 380


Processing papers:  75%|█████████████████████████████████████████████▍               | 381/511 [18:35<05:55,  2.74s/it]

Title: Timer: Transformers for Time Series Analysis at Scale
Evaluation Result: **Title: Timer: Transformers for Time Series Analysis at Scale**

Does it talk about LLMs: No.

Rate Limitations of LLMs: 1.

Evidence: The abstract discusses the development of large time series models (LTSM) and their applications in time series analysis, but it does not mention language models (LLMs or LLMs) specifically.

Progress saved at index: 381


Processing papers:  75%|█████████████████████████████████████████████▌               | 382/511 [18:38<05:45,  2.68s/it]

Title: Advancing Graph Representation Learning with Large Language Models: A Comprehensive Survey of Techniques
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Despite a growing body of research dedicated to integrating LLMs into the graph domain, a comprehensive review that deeply analyzes the core components and operations within these models is notably lacking."

Progress saved at index: 382


Processing papers:  75%|█████████████████████████████████████████████▋               | 383/511 [18:40<05:32,  2.60s/it]

Title: Large Language Model Adaptation for Networking
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Motivated by the recent success of large language models (LLMs), for the first time, this work studies the LLM adaptation for networking to explore a more sustainable design philosophy."

Progress saved at index: 383


Processing papers:  75%|█████████████████████████████████████████████▊               | 384/511 [18:42<05:20,  2.53s/it]

Title: Enhance Reasoning for Large Language Models in the Game Werewolf
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This paper presents an innovative framework that integrates Large Language Models (LLMs) with an external Thinker module to enhance the reasoning capabilities of LLM-based agents."

Progress saved at index: 384


Processing papers:  75%|█████████████████████████████████████████████▉               | 385/511 [18:45<05:10,  2.47s/it]

Title: A Survey of Large Language Models in Finance (FinLLMs)
Evaluation Result: Title: A Survey of Large Language Models in Finance (FinLLMs)
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Finally, we discuss the opportunities and the challenges facing FinLLMs, such as hallucination, privacy, and efficiency."

Progress saved at index: 385


Processing papers:  76%|██████████████████████████████████████████████               | 386/511 [18:47<05:07,  2.46s/it]

Title: Selecting Large Language Model to Fine-tune via Rectified Scaling Law
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "We also explain why existing scaling laws fail to capture this phase transition phenomenon both theoretically and empirically."

Progress saved at index: 386


Processing papers:  76%|██████████████████████████████████████████████▏              | 387/511 [18:50<05:08,  2.48s/it]

Title: Jailbreaking Attack against Multimodal Large Language Model
Evaluation Result: Title: Jailbreaking Attack against Multimodal Large Language Model
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "seeking to elicit MLLMs to generate objectionable responses to harmful user queries," and "we reveal a connection between MLLM-jailbreaks and LLM-jailbreaks."

Progress saved at index: 387


Processing papers:  76%|██████████████████████████████████████████████▎              | 388/511 [18:52<05:09,  2.52s/it]

Title: Large Language Model for Table Processing: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Finally, we highlight several challenges, ranging from private deployment and efficient inference to the development of extensive benchmarks for table manipulation and advanced data analysis."

Progress saved at index: 388


Processing papers:  76%|██████████████████████████████████████████████▍              | 389/511 [18:55<04:58,  2.45s/it]

Title: SynthDST: Synthetic Data is All You Need for Few-Shot Dialog State Tracking
Evaluation Result: Title: SynthDST: Synthetic Data is All You Need for Few-Shot Dialog State Tracking
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the best-performing in-context learning methods involve retrieving and adding similar examples to the prompt, requiring access to labeled training data. Procuring such training data for a wide range of domains and applications is time-consuming, expensive, and, at times, infeasible."

Progress saved at index: 389


Processing papers:  76%|██████████████████████████████████████████████▌              | 390/511 [18:59<05:52,  2.91s/it]

Title: Frequency Explains the Inverse Correlation of Large Language Models' Size, Training Data Amount, and Surprisal's Fit to Reading Times
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Recent studies have shown that as Transformer-based language models become larger and are trained on very large amounts of data, the fit of their surprisal estimates to naturalistic human reading times degrades."

Progress saved at index: 390


Processing papers:  77%|██████████████████████████████████████████████▋              | 391/511 [19:01<05:47,  2.89s/it]

Title: Beyond the Limits: A Survey of Techniques to Extend the Context Length in Large Language Models
Evaluation Result: **Title:** Beyond the Limits: A Survey of Techniques to Extend the Context Length in Large Language Models
**Paper:** Recently, large language models (LLMs) have shown remarkable capabilities including understanding context, engaging in logical reasoning, and generating responses. However, this is achieved at the expense of stringent computational and memory requirements, hindering their ability to effectively support long input sequences. This survey provides an inclusive review of the recent techniques and methods devised to extend the sequence length in LLMs, thereby

Progress saved at index: 391


Processing papers:  77%|██████████████████████████████████████████████▊              | 392/511 [19:04<05:38,  2.84s/it]

Title: Language Writ Large: LLMs, ChatGPT, Grounding, Meaning and Understanding
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "These biases are inherent in the nature of language itself, at LLM scale, and they are closely linked to what it is that ChatGPT lacks, which is direct sensorimotor grounding to connect its words to their referents and its propositions to their meanings."

Progress saved at index: 392


Processing papers:  77%|██████████████████████████████████████████████▉              | 393/511 [19:08<05:56,  3.02s/it]

Title: Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Current vision large language models (VLLMs) exhibit remarkable capabilities yet are prone to generate harmful content and are vulnerable to even the simplest jailbreaking attacks." and "Our initial analysis finds that this is due to the presence of harmful data during vision-language instruction fine-tuning, and that VLLM fine-tuning can cause forgetting of safety alignment previously learned by the underpinning L

Progress saved at index: 393


Processing papers:  77%|███████████████████████████████████████████████              | 394/511 [19:11<05:58,  3.06s/it]

Title: GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model on Complex Traffic Events
Evaluation Result: Title: GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model on Complex Traffic Events

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Concurrently, we also identify certain limitations of GPT-4V, which constrain its understanding in more intricate scenarios."

Progress saved at index: 394


Processing papers:  77%|███████████████████████████████████████████████▏             | 395/511 [19:14<05:48,  3.01s/it]

Title: Vi(E)va LLM! A Conceptual Stack for Evaluating and Interpreting Generative AI-based Visualizations
Evaluation Result: Title: Vi(E)va LLM! A Conceptual Stack for Evaluating and Interpreting Generative AI-based Visualizations
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "At the same time, several pitfalls, like the multiple ways of instructing an LLM to generate the desired result, the different perspectives leading the generation (code-based, image-based, grammar-based), and the presence of hallucinations even for the

Progress saved at index: 395


Processing papers:  77%|███████████████████████████████████████████████▎             | 396/511 [19:17<05:48,  3.03s/it]

Title: Analyzing Sentiment Polarity Reduction in News Presentation through Contextual Perturbation and Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "a context-aware masked language model"

Progress saved at index: 396


Processing papers:  78%|███████████████████████████████████████████████▍             | 397/511 [19:19<05:15,  2.76s/it]

Title: Do Moral Judgment and Reasoning Capability of LLMs Change with Language? A Study using the Multilingual Defining Issues Test
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Our study shows that the moral reasoning ability for all models, as indicated by the post-conventional score, is substantially inferior for Hindi and Swahili, compared to Spanish, Russian, Chinese and English, while there is no clear trend for the performance of the latter four languages."

Progress saved at index: 397


Processing papers:  78%|███████████████████████████████████████████████▌             | 398/511 [19:22<05:22,  2.86s/it]

Title: Rendering Graphs for Graph Reasoning in Multimodal Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Though LLMs can comprehend graph information in a textual format, they overlook the rich visual modality, which is an intuitive way for humans to comprehend structural information and conduct graph reasoning."

Progress saved at index: 398


Processing papers:  78%|███████████████████████████████████████████████▋             | 399/511 [19:25<05:33,  2.98s/it]

Title: Are Large Language Models Good Prompt Optimizers?
Evaluation Result: Title: Are Large Language Models Good Prompt Optimizers?
Paper: LLM-based Automatic Prompt Optimization, which typically utilizes LLMs as Prompt Optimizers to self-reflect and refine prompts, has shown promising performance in recent studies. Despite the success, the underlying mechanism of this approach remains unexplored, and the true effectiveness of LLMs as Prompt Optimizers requires further validation. In this work, we conducted a comprehensive study to uncover the actual mechanism of LLM-based Prompt Optimization. Our findings reveal

Progress saved at index: 399


Processing papers:  78%|███████████████████████████████████████████████▋             | 400/511 [19:28<05:26,  2.94s/it]

Title: Break the Sequential Dependency of LLM Inference Using Lookahead Decoding
Evaluation Result: Title: Break the Sequential Dependency of LLM Inference Using Lookahead Decoding  
Does it talk about LLMs: Yes.  
Rate Limitations of LLMs: 3.  
Evidence: "Autoregressive decoding of large language models (LLMs) is memory bandwidth bounded, resulting in high latency and significant wastes of the parallel processing power of modern accelerators."

Progress saved at index: 400


Processing papers:  78%|███████████████████████████████████████████████▊             | 401/511 [19:30<05:07,  2.80s/it]

Title: Affordable Generative Agents
Evaluation Result: Title: Affordable Generative Agents
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the substantial cost on maintaining the prolonged agent interactions poses challenge over the deployment of believable LLM-based agents," and "demonstrating that agents can only generate finite behaviors in fixed environments."

Progress saved at index: 401


Processing papers:  79%|███████████████████████████████████████████████▉             | 402/511 [19:33<04:53,  2.69s/it]

Title: Panacea: Pareto Alignment via Preference Adaptation for LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Current methods for large language model alignment typically use scalar human preference labels. However, this convention tends to oversimplify the multi-dimensional and heterogeneous nature of human preferences, leading to reduced expressivity and even misalignment."

Progress saved at index: 402


Processing papers:  79%|████████████████████████████████████████████████             | 403/511 [19:35<04:46,  2.65s/it]

Title: A Closer Look at the Limitations of Instruction Tuning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "While IT has achieved notable success and widespread adoption, its limitations and shortcomings remain underexplored. In this paper, through rigorous experiments and an in-depth analysis of the changes LLMs undergo through IT, we reveal various limitations of IT."

Progress saved at index: 403


Processing papers:  79%|████████████████████████████████████████████████▏            | 404/511 [19:38<04:42,  2.64s/it]

Title: How well do LLMs cite relevant medical references? An evaluation framework and analyses
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Interestingly, we find that between ~50% to 90% of LLM responses are not fully supported by the sources they provide." and "Given the rapid pace of LLM development and the potential harms of incorrect or outdated medical information, it is crucial to also understand and quantify their capability to produce relevant, trustworthy medical references."

Progress saved at index: 404


Processing papers:  79%|████████████████████████████████████████████████▎            | 405/511 [19:41<04:50,  2.74s/it]

Title: PresAIse, A Prescriptive AI Solution for Enterprises
Evaluation Result: Title: PresAIse, A Prescriptive AI Solution for Enterprises
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "the integration of large language models (LLMs) to bridge communication gaps via a conversation agent."

Progress saved at index: 405


Processing papers:  79%|████████████████████████████████████████████████▍            | 406/511 [19:43<04:38,  2.66s/it]

Title: Human-Centered Privacy Research in the Age of Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "The emergence of large language models (LLMs), and their increased use in user-facing systems, has led to substantial privacy concerns." and "To build usable, efficient, and privacy-friendly systems powered by these models with imperfect privacy properties, our goal is to initiate discussions to outline an agenda for conducting human-centered research on privacy issues in LLM-powered systems."

Progress saved at index: 406


Processing papers:  80%|████████████████████████████████████████████████▌            | 407/511 [19:46<04:42,  2.71s/it]

Title: Self-Debiasing Large Language Models: Zero-Shot Recognition and Reduction of Stereotypes
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Large language models (LLMs) have shown remarkable advances in language generation and understanding but are also prone to exhibiting harmful social biases."

Progress saved at index: 407


Processing papers:  80%|████████████████████████████████████████████████▋            | 408/511 [19:49<04:41,  2.73s/it]

Title: SOCIALITE-LLAMA: An Instruction-Tuned Model for Social Scientific Tasks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, little is known about the effectiveness of instruction tuning on the social domain where implicit pragmatic cues are often needed to be captured."

Progress saved at index: 408


Processing papers:  80%|████████████████████████████████████████████████▊            | 409/511 [19:53<05:08,  3.02s/it]

Title: A Survey on Context-Aware Multi-Agent Systems: Techniques, Challenges and Future Directions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the challenge lies in enabling these agents to learn, reason, and navigate uncertainties in dynamic environments."

Progress saved at index: 409


Processing papers:  80%|████████████████████████████████████████████████▉            | 410/511 [19:55<04:40,  2.78s/it]

Title: MasonPerplexity at Multimodal Hate Speech Event Detection 2024: Hate Speech and Target Detection Using Transformer Ensembles
Evaluation Result: Title: MasonPerplexity at Multimodal Hate Speech Event Detection 2024: Hate Speech and Target Detection Using Transformer Ensembles

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We use an XLM-roBERTa-large model for sub-task A and an ensemble approach combining XLM-roBERTa-base, BERTweet-large, and BERT-base for sub-task B."

Progress saved at index: 410


Processing papers:  80%|█████████████████████████████████████████████████            | 411/511 [19:58<04:53,  2.94s/it]

Title: Large Language Model Agent for Hyper-Parameter Optimization
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To address these issues, we introduce a novel paradigm leveraging Large Language Models (LLMs) to automate hyperparameter optimization across diverse machine learning tasks."

Progress saved at index: 411


Processing papers:  81%|█████████████████████████████████████████████████▏           | 412/511 [20:01<04:31,  2.74s/it]

Title: The RL/LLM Taxonomy Tree: Reviewing Synergies Between Reinforcement Learning and Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We use this taxonomy to explore the motivations behind the synergy of LLMs and RL and explain the reasons for its success, while pinpointing potential shortcomings and areas where further research is needed, as well as alternative methodologies that serve the same goal."

Progress saved at index: 412


Processing papers:  81%|█████████████████████████████████████████████████▎           | 413/511 [20:03<04:27,  2.73s/it]

Title: Leveraging Large Language Models for Structure Learning in Prompted Weak Supervision
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We further extend the use of LLMs in the loop to address one of the key challenges in weak supervision: learning the statistical dependency structure among supervision sources."

Progress saved at index: 413


Processing papers:  81%|█████████████████████████████████████████████████▍           | 414/511 [20:06<04:14,  2.62s/it]

Title: What Will My Model Forget? Forecasting Forgotten Examples in Language Model Refinement
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Language models deployed in the wild make errors. However, simply updating the model with the corrected error instances causes catastrophic forgetting -- the updated model makes errors on instances learned during the instruction tuning or upstream training phase."

Progress saved at index: 414


Processing papers:  81%|█████████████████████████████████████████████████▌           | 415/511 [20:08<04:10,  2.61s/it]

Title: (A)I Am Not a Lawyer, But...: Engaging Legal Experts towards Responsible LLM Policies for Legal Advice
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Beyond known issues like hallucinations, experts revealed novel legal problems, including that users' conversations with LLMs are not protected by attorney-client confidentiality or bound to professional ethics that guard against conflicted counsel or poor quality advice."

Progress saved at index: 415


Processing papers:  81%|█████████████████████████████████████████████████▋           | 416/511 [20:11<04:05,  2.59s/it]

Title: Cross-modality debiasing: using language to mitigate sub-population shifts in imaging
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Recent studies found inherent distributional robustness in multi-modality foundation models, such as the vision-language model CLIP, yet this robustness is vulnerable through parameter fine-tuning."

Progress saved at index: 416


Processing papers:  82%|█████████████████████████████████████████████████▊           | 417/511 [20:13<03:54,  2.49s/it]

Title: Peer-review-in-LLMs: Automatic Evaluation Method for LLMs in Open-environment
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Existing large language models (LLMs) evaluation methods typically focus on testing the performance on some closed-environment and domain-specific benchmarks with human annotations."

Progress saved at index: 417


Processing papers:  82%|█████████████████████████████████████████████████▉           | 418/511 [20:15<03:45,  2.42s/it]

Title: TravelPlanner: A Benchmark for Real-World Planning with Language Agents
Evaluation Result: **Title: TravelPlanner: A Benchmark for Real-World Planning with Language Agents**

**Paper:**
Planning has been part of the core pursuit for artificial intelligence since its conception, but earlier AI agents mostly focused on constrained settings because many of the cognitive substrates necessary for human-level planning have been lacking. Recently, language agents powered by large language models (LLMs) have shown interesting capabilities such as tool use and reasoning. Are these language agents capable of planning in more complex settings that are out of the

Progress saved at index: 418


Processing papers:  82%|██████████████████████████████████████████████████           | 419/511 [20:26<07:23,  4.82s/it]

Title: Stochastic Two Points Method for Deep Model Zeroth-order Optimization
Evaluation Result: Title: Stochastic Two Points Method for Deep Model Zeroth-order Optimization
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Building or fully fine-tuning such large models is usually prohibitive due to either hardware budget or lack of access to backpropagation."

Progress saved at index: 419


Processing papers:  82%|██████████████████████████████████████████████████▏          | 420/511 [20:28<06:15,  4.12s/it]

Title: MAGDi: Structured Distillation of Multi-Agent Interaction Graphs Improves Reasoning in Smaller Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, these involve long generations from multiple models across several rounds, making them expensive. Moreover, these multi-agent approaches fail to provide a final, single model for efficient inference."

Progress saved at index: 420


Processing papers:  82%|██████████████████████████████████████████████████▎          | 421/511 [20:31<05:23,  3.59s/it]

Title: KB-Plugin: A Plug-and-play Framework for Large Language Models to Induce Programs over Low-resourced Knowledge Bases
Evaluation Result: Title: KB-Plugin: A Plug-and-play Framework for Large Language Models to Induce Programs over Low-resourced Knowledge Bases
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "PI typically relies on a large number of parallel question-program pairs to make the LLM aware of the schema of the given KB, and is thus challenging for many low-resourced KBs that lack annotated data."

Progress saved at index: 421


Processing papers:  83%|██████████████████████████████████████████████████▍          | 422/511 [20:33<04:56,  3.33s/it]

Title: Style Vectors for Steering Generative Large Language Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This research explores strategies for steering the output of large language models (LLMs) towards specific styles, such as sentiment, emotion, or writing style, by adding style vectors to the activations of hidden layers during text generation."

Progress saved at index: 422


Processing papers:  83%|██████████████████████████████████████████████████▍          | 423/511 [20:36<04:29,  3.06s/it]

Title: Leveraging Large Language Models for Analyzing Blood Pressure Variations Across Biological Sex from Scientific Literature
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this work, we employed GPT-35-turbo, a large language model (LLM), to automatically extract the mean and standard deviation values of BP for both males and females from a dataset comprising 25 million abstracts sourced from PubMed."

Progress saved at index: 423


Processing papers:  83%|██████████████████████████████████████████████████▌          | 424/511 [20:38<04:13,  2.91s/it]

Title: Foundation Model Sherpas: Guiding Foundation Models through Knowledge and Reasoning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, they exhibit numerous limitations that prevent their broader adoption in many real-world systems, which often require a higher bar for trustworthiness and usability."

Progress saved at index: 424


Processing papers:  83%|██████████████████████████████████████████████████▋          | 425/511 [20:40<03:49,  2.67s/it]

Title: TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent Constitution
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "their trustworthiness remains an under-explored area" and "improving the safety dimension of trustworthiness in LLM-based agents."

Progress saved at index: 425


Processing papers:  83%|██████████████████████████████████████████████████▊          | 426/511 [20:43<03:44,  2.64s/it]

Title: Building Guardrails for Large Language Models
Evaluation Result: Title: Building Guardrails for Large Language Models
Paper: As Large Language Models (LLMs) become more integrated into our daily lives, it is crucial to identify and mitigate their risks, especially when the risks can have profound impacts on human users and societies. Guardrails, which filter the inputs or outputs of LLMs, have emerged as a core safeguarding technology. This position paper takes a deep look at current open-source solutions (Llama Guard, Nvidia NeMo, Guardrails AI), and

Progress saved at index: 426


Processing papers:  84%|██████████████████████████████████████████████████▉          | 427/511 [20:48<04:43,  3.37s/it]

Title: Ecologically rational meta-learned inference explains human category learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this work, we demonstrate that large language models can generate cognitive tasks, specifically category learning tasks, that match the statistics of real-world tasks, thereby addressing the first challenge."

Progress saved at index: 427


Processing papers:  84%|███████████████████████████████████████████████████          | 428/511 [20:50<04:15,  3.08s/it]

Title: Homogenization Effects of Large Language Models on Human Creative Ideation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "different users tended to produce less semantically distinct ideas with ChatGPT than with an alternative CST" and "ChatGPT users generated a greater number of more detailed ideas, but felt less responsible for the ideas they generated."

Progress saved at index: 428


Processing papers:  84%|███████████████████████████████████████████████████▏         | 429/511 [20:53<04:10,  3.06s/it]

Title: An Empirical Analysis of Diversity in Argument Summarization
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We find that both general-purpose LLMs and dedicated KPA models exhibit this behavior, but have complementary strengths."

Progress saved at index: 429


Processing papers:  84%|███████████████████████████████████████████████████▎         | 430/511 [20:56<03:44,  2.77s/it]

Title: Decoding Speculative Decoding
Evaluation Result: Title: Decoding Speculative Decoding

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "The speedup provided by speculative decoding heavily depends on the choice of the draft model." and "our experiments indicate the contrary with throughput diminishing as the probability of generated tokens to be accepted by the target model increases."

Progress saved at index: 430


Processing papers:  84%|███████████████████████████████████████████████████▍         | 431/511 [20:58<03:44,  2.80s/it]

Title: K-Level Reasoning with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "existing reasoning methods tend to falter in dynamic settings that require k-level thinking - a key concept not tackled by previous works."

Progress saved at index: 431


Processing papers:  85%|███████████████████████████████████████████████████▌         | 432/511 [21:01<03:29,  2.65s/it]

Title: A Comparative Analysis of Conversational Large Language Models in Knowledge-Based Text Generation
Evaluation Result: Title: A Comparative Analysis of Conversational Large Language Models in Knowledge-Based Text Generation

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "large language models, which offer great potential for conversational interaction but are prone to hallucinating, omitting, or producing conflicting information."

Progress saved at index: 432


Processing papers:  85%|███████████████████████████████████████████████████▋         | 433/511 [21:04<03:41,  2.85s/it]

Title: AMOR: A Recipe for Building Adaptable Modular Knowledge Agents Through Process Feedback
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The notable success of large language models (LLMs) has sparked an upsurge in building language agents to complete various complex tasks."

Progress saved at index: 433


Processing papers:  85%|███████████████████████████████████████████████████▊         | 434/511 [21:07<03:34,  2.78s/it]

Title: Integrating Large Language Models in Causal Discovery: A Statistical Causal Approach
Evaluation Result: Title: Integrating Large Language Models in Causal Discovery: A Statistical Causal Approach

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "The proposed approach can thus address challenges such as dataset biases and limitations, illustrating the potential of LLMs to improve data-driven causal inference across diverse scientific domains."

Progress saved at index: 434


Processing papers:  85%|███████████████████████████████████████████████████▉         | 435/511 [21:10<03:38,  2.87s/it]

Title: LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We argue that auto-regressive LLMs cannot, by themselves, do planning or self-verification (which is after all a form of reasoning), and shed some light on the reasons for misunderstandings in the literature."

Progress saved at index: 435


Processing papers:  85%|████████████████████████████████████████████████████         | 436/511 [21:12<03:28,  2.78s/it]

Title: Distilling LLMs' Decomposition Abilities into Compact Language Models
Evaluation Result: Title: Distilling LLMs' Decomposition Abilities into Compact Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Large Language Models (LLMs) have demonstrated proficiency in their reasoning abilities, yet their large size presents scalability challenges and limits any further customization."

Progress saved at index: 436


Processing papers:  86%|████████████████████████████████████████████████████▏        | 437/511 [21:15<03:20,  2.71s/it]

Title: StepCoder: Improve Code Generation with Reinforcement Learning from Compiler Feedback
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the lengthy code generated by LLMs in response to complex human requirements makes RL exploration a challenge. Also, since the unit tests may not cover the complicated code, optimizing LLMs by using these unexecuted code snippets is ineffective."

Progress saved at index: 437


Processing papers:  86%|████████████████████████████████████████████████████▎        | 438/511 [21:17<03:11,  2.63s/it]

Title: LLM-based NLG Evaluation: Current Status and Challenges
Evaluation Result: Title: LLM-based NLG Evaluation: Current Status and Challenges
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "we first give a taxonomy of LLM-based NLG evaluation methods, and discuss their pros and cons, respectively."

Progress saved at index: 438


Processing papers:  86%|████████████████████████████████████████████████████▍        | 439/511 [21:20<03:04,  2.56s/it]

Title: Continual Learning for Large Language Models: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Large language models (LLMs) are not amenable to frequent re-training, due to high training costs arising from their massive scale." and "Moreover, informed by a discussion of benchmarks and evaluation, we identify several challenges and future work directions for this crucial task."

Progress saved at index: 439


Processing papers:  86%|████████████████████████████████████████████████████▌        | 440/511 [21:22<03:00,  2.54s/it]

Title: A Survey on Large Language Model Hallucination via a Creativity Perspective
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "This survey begins with a review of the taxonomy of hallucinations and their negative impact on LLM reliability in critical applications."

Progress saved at index: 440


Processing papers:  86%|████████████████████████████████████████████████████▋        | 441/511 [21:24<02:51,  2.45s/it]

Title: Beyond the Answers: Reviewing the Rationality of Multiple Choice Question Answering for the Evaluation of Large Language Models
Evaluation Result: Title: Beyond the Answers: Reviewing the Rationality of Multiple Choice Question Answering for the Evaluation of Large Language Models

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "our empirical findings suggest a notable disparity in the consistency of LLM responses, which we define as REsponse VAriability Syndrome (REVAS) of the LLMs, indicating that current MCQA-based benchmarks may not adequately capture the true capabilities of L

Progress saved at index: 441


Processing papers:  86%|████████████████████████████████████████████████████▊        | 442/511 [21:27<02:57,  2.57s/it]

Title: Preference-free Alignment Learning with Regularized Relevance Reward
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "our preliminary study reveals that reward models trained on human preference datasets tend to give higher scores to long off-topic responses than short on-topic ones" and "the relevance score obtained by a retriever alone is vulnerable to reward hacking, i.e., overoptimizing to undesired shortcuts."

Progress saved at index: 442


Processing papers:  87%|████████████████████████████████████████████████████▉        | 443/511 [21:30<02:57,  2.61s/it]

Title: Training-time Neuron Alignment through Permutation Subspace for Improving Linear Mode Connectivity and Model Fusion
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "However, these post-hoc methods, demanding extra computations, are less effective for larger, complex models (e.g., ViT, LLM) due to numerous permutation matrices."

Progress saved at index: 443


Processing papers:  87%|█████████████████████████████████████████████████████        | 444/511 [21:33<02:56,  2.63s/it]

Title: KTO: Model Alignment as Prospect Theoretic Optimization
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases -- the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them being $\textit{human-aware loss functions}$ (HALOs)."

Progress saved at index: 444


Processing papers:  87%|█████████████████████████████████████████████████████        | 445/511 [21:36<03:00,  2.73s/it]

Title: Can MLLMs Perform Text-to-Image In-Context Learning?
Evaluation Result: Title: Can MLLMs Perform Text-to-Image In-Context Learning?
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "we uncover considerable difficulties MLLMs encounter in solving T2I-ICL. We identify the primary challenges as the inherent complexity of multimodality and image generation."

Progress saved at index: 445


Processing papers:  87%|█████████████████████████████████████████████████████▏       | 446/511 [21:38<02:54,  2.69s/it]

Title: Exploring the Limitations of Graph Reasoning in Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We highlight various limitations, biases, and properties of LLMs through this benchmarking process, such as an inverse relation to the average degrees of freedom of traversal per node in graphs, the overall negative impact of k-shot prompting on graph reasoning tasks, and a positive response bias which prevents LLMs from identifying the absence of a valid solution."

Progress saved at index: 446


Processing papers:  87%|█████████████████████████████████████████████████████▎       | 447/511 [21:42<03:10,  2.98s/it]

Title: The Human and the Mechanical: logos, truthfulness, and ChatGPT
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "‘Mechanical minds’ lack these two components: (i) they do not relate to reality and (ii) do not have endogenous evidence. Therefore they lack the ability to form a belief about the world and a veridicality judgments altogether."

Progress saved at index: 447


Processing papers:  88%|█████████████████████████████████████████████████████▍       | 448/511 [21:46<03:33,  3.39s/it]

Title: Efficient Causal Graph Discovery Using Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "While previous LLM-based methods have used a pairwise query approach, this requires a quadratic number of queries which quickly becomes impractical for larger causal graphs."

Progress saved at index: 448


Processing papers:  88%|█████████████████████████████████████████████████████▌       | 449/511 [21:49<03:10,  3.07s/it]

Title: Large Language Models for Time Series: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "We address the inherent challenge of bridging the gap between LLMs' original text data training and the numerical nature of time series data, and explore strategies for transferring and distilling knowledge from LLMs to numerical time series analysis."

Progress saved at index: 449


Processing papers:  88%|█████████████████████████████████████████████████████▋       | 450/511 [21:51<02:58,  2.93s/it]

Title: Towards a Unified Language Model for Knowledge-Intensive Tasks Utilizing External Corpus
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "yet they often hallucinate, especially in knowledge-intensive tasks that require external knowledge sources."

Progress saved at index: 450


Processing papers:  88%|█████████████████████████████████████████████████████▊       | 451/511 [21:54<02:47,  2.79s/it]

Title: Efficient Prompt Caching via Embedding Similarity
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, it faces the challenge of significant resource consumption during inference."

Progress saved at index: 451


Processing papers:  88%|█████████████████████████████████████████████████████▉       | 452/511 [21:57<03:00,  3.06s/it]

Title: Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Despite the impressive performance of LLMs, their widespread adoption faces challenges due to substantial computational and memory requirements during inference."

Progress saved at index: 452


Processing papers:  89%|██████████████████████████████████████████████████████       | 453/511 [22:00<02:44,  2.84s/it]

Title: LLM-Detector: Improving AI-Generated Chinese Text Detection with Open-Source LLM Instruction Tuning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Existing AI-generated text detection models, such as based on BERT and RoBERTa, are prone to in-domain over-fitting, leading to poor out-of-domain (OOD) detection performance."

Progress saved at index: 453


Processing papers:  89%|██████████████████████████████████████████████████████▏      | 454/511 [22:02<02:36,  2.75s/it]

Title: CABINET: Content Relevance based Noise Reduction for Table Question Answering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The irrelevant parts act as noise and are distracting information, resulting in sub-optimal performance due to the vulnerability of LLMs to noise."

Progress saved at index: 454


Processing papers:  89%|██████████████████████████████████████████████████████▎      | 455/511 [22:04<02:26,  2.61s/it]

Title: ReEvo: Large Language Models as Hyper-Heuristics with Reflective Evolution
Evaluation Result: Title: ReEvo: Large Language Models as Hyper-Heuristics with Reflective Evolution
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This paper introduces Language Hyper-Heuristics (LHHs), an emerging variant of Hyper-Heuristics that leverages LLMs for heuristic generation, featuring minimal manual intervention and open-ended heuristic spaces."

Progress saved at index: 455


Processing papers:  89%|██████████████████████████████████████████████████████▍      | 456/511 [22:07<02:25,  2.65s/it]

Title: A Multi-Agent Conversational Recommender System
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Unlike the aimless chit-chat that LLM excels at, CRS has a clear target. So it is imperative to control the dialogue flow in the LLM to successfully recommend appropriate items to the users. Furthermore, user feedback in CRS can assist the system in better modeling user preferences, which has been ignored by existing studies. However, simply prompting LLM to conduct conversational recommendation cannot address

Progress saved at index: 456


Processing papers:  89%|██████████████████████████████████████████████████████▌      | 457/511 [22:11<02:41,  2.99s/it]

Title: PokeLLMon: A Human-Parity Agent for Pokemon Battles with Large Language Models
Evaluation Result: Title: PokeLLMon: A Human-Parity Agent for Pokemon Battles with Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "The design of PokeLLMon incorporates three key strategies: (i) In-context reinforcement learning that instantly consumes text-based feedback derived from battles to iteratively refine the policy; (ii) Knowledge-augmented generation that retrieves external knowledge to counteract hallucination and enables the agent

Progress saved at index: 457


Processing papers:  90%|██████████████████████████████████████████████████████▋      | 458/511 [22:16<03:08,  3.55s/it]

Title: DTS-SQL: Decomposed Text-to-SQL with Small Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Leading models for the text-to-SQL task heavily rely on proprietary Large Language Models (LLMs), posing concerns over data privacy."

Progress saved at index: 458


Processing papers:  90%|██████████████████████████████████████████████████████▊      | 459/511 [22:18<02:47,  3.22s/it]

Title: Vaccine: Perturbation-aware Alignment for Large Language Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "The new paradigm of finetuning-as-a-service introduces a new attack surface for Large Language Models (LLMs): a few harmful data uploaded by users can easily trick the finetuning to produce an alignment-broken model."

Progress saved at index: 459


Processing papers:  90%|██████████████████████████████████████████████████████▉      | 460/511 [22:21<02:39,  3.12s/it]

Title: Reasoning Capacity in Multi-Agent Systems: Limitations, Challenges and Human-Centered Solutions
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Remarkable performance of large language models (LLMs) in a variety of tasks brings forth many opportunities as well as challenges of utilizing them in production settings." and "Despite the tremendous success of these systems, current approaches rely on narrow, single-focus objectives for optimization and evaluation, often overlooking potential constraints in real-world scenarios, including restricted budgets, resources and time."

Progress saved at index: 460


Processing papers:  90%|███████████████████████████████████████████████████████      | 461/511 [22:25<02:47,  3.35s/it]

Title: The Political Preferences of LLMs
Evaluation Result: Title: The Political Preferences of LLMs

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "The results indicate that when probed with questions/statements with political connotations most conversational LLMs tend to generate responses that are diagnosed by most political test instruments as manifesting preferences for left-of-center viewpoints." and "base models' suboptimal performance at coherently answering questions suggests caution when interpreting their classification by political orientation tests."

Progress saved at index: 461


Processing papers:  90%|███████████████████████████████████████████████████████▏     | 462/511 [22:29<02:50,  3.48s/it]

Title: LitLLM: A Toolkit for Scientific Literature Review
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Unfortunately, many existing works that generate such reviews using Large Language Models (LLMs) have significant limitations. They tend to hallucinate-generate non-actual information-and ignore the latest research they have not been trained on."

Progress saved at index: 462


Processing papers:  91%|███████████████████████████████████████████████████████▎     | 463/511 [22:32<02:37,  3.28s/it]

Title: Character-based Outfit Generation with Vision-augmented Style Extraction via LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we propose a novel framework LVA-COG that leverages Large Language Models (LLMs) to extract insights from customer interests (e.g., character information) and employ prompt engineering techniques for accurate understanding of customer preferences."

Title: Mind the Gap: Understanding the Performance of Language Models on Zero-Shot Learning Tasks
Paper: Zero-shot learning (ZSL) is a challenging

Progress saved at index: 463


Processing papers:  91%|███████████████████████████████████████████████████████▍     | 464/511 [22:35<02:30,  3.21s/it]

Title: Specialized Language Models with Cheap Inference from Limited Domain Data
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large language models have emerged as a versatile tool but are challenging to apply to tasks lacking large inference budgets and large in-domain training sets."

Progress saved at index: 464


Processing papers:  91%|███████████████████████████████████████████████████████▌     | 465/511 [22:37<02:15,  2.95s/it]

Title: Chameleon: Foundation Models for Fairness-aware Multi-modal Data Augmentation to Enhance Coverage of Minorities
Evaluation Result: **Title: Chameleon: Foundation Models for Fairness-aware Multi-modal Data Augmentation to Enhance Coverage of Minorities**

**Paper:** "The potential harms of the under-representation of minorities in training data, particularly in multi-modal settings, is a well-recognized concern. While there has been extensive effort in detecting such under-representation, resolution has remained a challenge. With recent advancements in generative AI, large language models and foundation models have emerged as versatile tools across various domains. In this

Progress saved at index: 465


Processing papers:  91%|███████████████████████████████████████████████████████▋     | 466/511 [22:40<02:19,  3.09s/it]

Title: Evaluation Methodology for Large Language Models for Multilingual Document Question and Answer
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "With the widespread adoption of Large Language Models (LLMs), in this paper we investigate the multilingual capability of these models."

Progress saved at index: 466


Processing papers:  91%|███████████████████████████████████████████████████████▋     | 467/511 [22:43<02:12,  3.02s/it]

Title: Plan-Grounded Large Language Models for Dual Goal Conversational Settings
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Yet, it is not completely clear how an LLM can lead a plan-grounded conversation in mixed-initiative settings where instructions flow in both directions of the conversation, i.e. both the LLM and the user provide instructions to one another."

Progress saved at index: 467


Processing papers:  92%|███████████████████████████████████████████████████████▊     | 468/511 [22:46<02:02,  2.84s/it]

Title: Generation, Distillation and Evaluation of Motivational Interviewing-Style Reflections with a Foundational Language Model
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Large Foundational Language Models are capable of performing many tasks at a high level but are difficult to deploy in many applications because of their size and proprietary ownership."

Progress saved at index: 468


Processing papers:  92%|███████████████████████████████████████████████████████▉     | 469/511 [22:48<01:54,  2.72s/it]

Title: IMUGPT 2.0: Language-Based Cross Modality Transfer for Sensor-Based Human Activity Recognition
Evaluation Result: Title: IMUGPT 2.0: Language-Based Cross Modality Transfer for Sensor-Based Human Activity Recognition
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "With the emergence of generative AI models such as large language models (LLMs) and text-driven motion synthesis models, language has become a promising source data modality as well as shown in proof of concepts such as IMUGPT."

Progress saved at index: 469


Processing papers:  92%|████████████████████████████████████████████████████████     | 470/511 [22:51<01:51,  2.73s/it]

Title: COA-GPT: Generative Pre-trained Transformers for Accelerated Course of Action Development in Military Operations
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This study introduces COA-GPT, a novel algorithm employing Large Language Models (LLMs) for rapid and efficient generation of valid COAs."

Progress saved at index: 470


Processing papers:  92%|████████████████████████████████████████████████████████▏    | 471/511 [22:53<01:42,  2.57s/it]

Title: Getting the most out of your tokenizer for pre-training and domain adaptation
Evaluation Result: Title: Getting the most out of your tokenizer for pre-training and domain adaptation

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Tokenization is an understudied and often neglected component of modern LLMs."

Progress saved at index: 471


Processing papers:  92%|████████████████████████████████████████████████████████▎    | 472/511 [22:55<01:36,  2.47s/it]

Title: Repeat After Me: Transformers are Better than State Space Models at Copying
Evaluation Result: Title: Repeat After Me: Transformers are Better than State Space Models at Copying

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "Finally, we evaluate pretrained large language models and find that transformer models dramatically outperform state space models at copying and retrieving information from context."

Progress saved at index: 472


Processing papers:  93%|████████████████████████████████████████████████████████▍    | 473/511 [22:58<01:34,  2.47s/it]

Title: Executable Code Actions Elicit Better LLM Agents
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "LLM agents are typically prompted to produce actions by generating JSON or text in a pre-defined format, which is usually limited by constrained action space (e.g., the scope of pre-defined tools) and restricted flexibility (e.g., inability to compose multiple tools)."

Progress saved at index: 473


Processing papers:  93%|████████████████████████████████████████████████████████▌    | 474/511 [23:01<01:35,  2.58s/it]

Title: HR-MultiWOZ: A Task Oriented Dialogue (TOD) Dataset for HR LLM Agent
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, the aforementioned developments must grapple with the pivotal challenge of constructing a high-quality training dataset."

Progress saved at index: 474


Processing papers:  93%|████████████████████████████████████████████████████████▋    | 475/511 [23:03<01:31,  2.54s/it]

Title: When Benchmarks are Targets: Revealing the Sensitivity of Large Language Model Leaderboards
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Under existing leaderboards, the relative performance of LLMs is highly sensitive to (often minute) details."

Progress saved at index: 475


Processing papers:  93%|████████████████████████████████████████████████████████▊    | 476/511 [23:05<01:26,  2.46s/it]

Title: Evaluating Large Language Models for Generalization and Robustness via Data Compression
Evaluation Result: Title: Evaluating Large Language Models for Generalization and Robustness via Data Compression
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Existing methods for evaluating large language models face challenges such as data contamination, sensitivity to prompts, and the high cost of benchmark creation." and "We find that the compression rate of many models reduces significantly after their cutoff date," and "Results also suggest that models struggle to generalize on news and code

Progress saved at index: 476


Processing papers:  93%|████████████████████████████████████████████████████████▉    | 477/511 [23:08<01:29,  2.64s/it]

Title: Can Large Language Models Understand Context?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Experimental results indicate that pre-trained dense models struggle with understanding more nuanced contextual features when compared to state-of-the-art fine-tuned models" and "we assess the context understanding of quantized models under in-context-learning settings. We find that 3-bit post-training quantization leads to varying degrees of performance reduction on our benchmark."

Progress saved at index: 477


Processing papers:  94%|█████████████████████████████████████████████████████████    | 478/511 [23:11<01:28,  2.68s/it]

Title: Tiny Titans: Can Smaller Large Language Models Punch Above Their Weight in the Real World for Meeting Summarization?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, deploying LLMs in the real world is not trivial, as it requires substantial computing resources." and "we observe that most smaller LLMs, even after fine-tuning, fail to outperform larger zero-shot LLMs in meeting summarization datasets."

Progress saved at index: 478


Processing papers:  94%|█████████████████████████████████████████████████████████▏   | 479/511 [23:14<01:25,  2.68s/it]

Title: Formal-LLM: Integrating Formal Language and Natural Language for Controllable LLM-based Agents
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, since LLM's content generation process is hardly controllable, current LLM-based agents frequently generate invalid or non-executable plans, which jeopardizes the performance of the generated plans and corrupts users' trust in LLM-based agents."

Progress saved at index: 479


Processing papers:  94%|█████████████████████████████████████████████████████████▎   | 480/511 [23:16<01:21,  2.64s/it]

Title: LLMs learn governing principles of dynamical systems, revealing an in-context neural scaling law
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Pretrained large language models (LLMs) are surprisingly effective at performing zero-shot tasks, including time-series forecasting."

Progress saved at index: 480


Processing papers:  94%|█████████████████████████████████████████████████████████▍   | 481/511 [23:20<01:30,  3.00s/it]

Title: Dense Reward for Free in Reinforcement Learning from Human Feedback
Evaluation Result: Title: Dense Reward for Free in Reinforcement Learning from Human Feedback
Paper: Reinforcement Learning from Human Feedback (RLHF) has been credited as the key advance that has allowed Large Language Models (LLMs) to effectively follow instructions and produce useful assistance. Classically, this involves generating completions from the LLM in response to a query before using a separate reward model to assign a score to the full completion. As an auto-regressive process, the LLM has to take many "actions

Progress saved at index: 481


Processing papers:  94%|█████████████████████████████████████████████████████████▌   | 482/511 [23:24<01:29,  3.10s/it]

Title: Unlearnable Algorithms for In-context Learning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "In this paper, we focus on efficient unlearning methods for the task adaptation phase of a pretrained large language model (LLM)."

Progress saved at index: 482


Processing papers:  95%|█████████████████████████████████████████████████████████▋   | 483/511 [23:26<01:20,  2.87s/it]

Title: Health-LLM: Personalized Retrieval-Augmented Disease Prediction System
Evaluation Result: Title: Health-LLM: Personalized Retrieval-Augmented Disease Prediction System
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "However, there are conspicuous challenges such as vast data volumes and inconsistent symptom characterization standards, preventing full integration of healthcare AI systems with individual patients' needs."

Progress saved at index: 483


Processing papers:  95%|█████████████████████████████████████████████████████████▊   | 484/511 [23:29<01:16,  2.83s/it]

Title: Enhancing Ethical Explanations of Large Language Models through Iterative Symbolic Refinement
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Despite their success, however, LLMs are still prone to factual errors and inconsistencies in their explanations, offering limited control and interpretability for inference in complex domains."

Progress saved at index: 484


Processing papers:  95%|█████████████████████████████████████████████████████████▉   | 485/511 [23:31<01:11,  2.75s/it]

Title: Transforming and Combining Rewards for Aligning Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "This mitigates both underfitting (where some prompts are not improved) and reward hacking (where the model learns to exploit misspecification of the reward model)."

Progress saved at index: 485


Processing papers:  95%|██████████████████████████████████████████████████████████   | 486/511 [23:34<01:06,  2.67s/it]

Title: Intent Assurance using LLMs guided by Intent Drift
Evaluation Result: Title: Intent Assurance using LLMs guided by Intent Drift
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we leverage AI-driven policies, generated by Large Language Models (LLMs) which can quickly learn the necessary in-context requirements, and assist with the fulfillment and assurance of intents."

Progress saved at index: 486


Processing papers:  95%|██████████████████████████████████████████████████████████▏  | 487/511 [23:37<01:06,  2.78s/it]

Title: Ocassionally Secure: A Comparative Analysis of Code Generation Assistants
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "These insights are crucial for understanding the models' capabilities and limitations, guiding future development and practical applications in the field of automated code generation."

Progress saved at index: 487


Processing papers:  95%|██████████████████████████████████████████████████████████▎  | 488/511 [23:39<01:02,  2.70s/it]

Title: Learning Planning-based Reasoning by Trajectories Collection and Process Reward Synthesizing
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "However, recent studies have raised concerns regarding the hallucination and flaws in their reasoning process... the planning-based search process often results in high latency due to the frequent assessment of intermediate reasoning states and the extensive exploration space. Additionally, supervising the reasoning process with human annotation is costly and challenging to scale for LLM training."

Progress saved at index: 488


Processing papers:  96%|██████████████████████████████████████████████████████████▎  | 489/511 [23:42<01:02,  2.82s/it]

Title: Vision-LLMs Can Fool Themselves with Self-Generated Typographic Attacks
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "We uncover that Self-Generated attacks pose a significant threat, reducing LVLM(s) classification performance by up to 33%."

Progress saved at index: 489


Processing papers:  96%|██████████████████████████████████████████████████████████▍  | 490/511 [23:46<01:04,  3.08s/it]

Title: Actor Identification in Discourse: A Challenge for LLMs?
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "Evaluating on a corpus of German actors in newspaper reports, we find surprisingly that the LLM performs worse. Further analysis reveals that the LLM is very good at identifying the right reference, but struggles to generate the correct canonical form. This points to an underlying issue in LLMs with controlling generated output."

Progress saved at index: 490


Processing papers:  96%|██████████████████████████████████████████████████████████▌  | 491/511 [23:50<01:03,  3.18s/it]

Title: Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning
Evaluation Result: Title: Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "Instruction tuning is critical to improve LLMs but usually suffers from low-quality and redundant data." and "But it also leads to extra cost and computation due to the involvement of LLMs in this process."

Progress saved at index: 491


Processing papers:  96%|██████████████████████████████████████████████████████████▋  | 492/511 [23:52<00:57,  3.05s/it]

Title: EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit Large Language Models
Evaluation Result: Title: EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit Large Language Models
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This work introduces EE-Tuning, a lightweight and economical solution to training/tuning early-exit large language models (LLMs)."

Progress saved at index: 492


Processing papers:  96%|██████████████████████████████████████████████████████████▊  | 493/511 [23:55<00:51,  2.88s/it]

Title: SA-MDKIF: A Scalable and Adaptable Medical Domain Knowledge Injection Framework for Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, their effective application in the medical domain is hampered by a lack of medical domain knowledge."

Progress saved at index: 493


Processing papers:  97%|██████████████████████████████████████████████████████████▉  | 494/511 [23:57<00:45,  2.66s/it]

Title: From PARIS to LE-PARIS: Toward Patent Response Automation with Recommender Systems and Collaborative Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "To bridge this gap, our study introduces the Patent Office Action Response Intelligence System (PARIS) and its advanced version, the Large Language Model (LLM) Enhanced PARIS (LE-PARIS)."

Progress saved at index: 494


Processing papers:  97%|███████████████████████████████████████████████████████████  | 495/511 [23:59<00:41,  2.60s/it]

Title: Prompt-Time Symbolic Knowledge Capture with Large Language Models
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 2.
Evidence: "LLMs inherently lack mechanisms for prompt-driven knowledge capture."

Progress saved at index: 495


Processing papers:  97%|███████████████████████████████████████████████████████████▏ | 496/511 [24:01<00:36,  2.45s/it]

Title: Hidding the Ghostwriters: An Adversarial Evaluation of AI-Generated Student Essay Detection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "Large language models (LLMs) have exhibited remarkable capabilities in text generation tasks. However, the utilization of these models carries inherent risks, including but not limited to plagiarism, the dissemination of fake news, and issues in educational exercises."

Progress saved at index: 496


Processing papers:  97%|███████████████████████████████████████████████████████████▎ | 497/511 [24:04<00:36,  2.59s/it]

Title: Investigating Bias Representations in Llama 2 Chat via Activation Steering
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "We address the challenge of societal bias in Large Language Models (LLMs)," and "Our findings reveal inherent gender bias in Llama 2 7B Chat, persisting even after Reinforcement Learning from Human Feedback (RLHF)."

Progress saved at index: 497


Processing papers:  97%|███████████████████████████████████████████████████████████▍ | 498/511 [24:07<00:33,  2.61s/it]

Title: What Does the Bot Say? Opportunities and Risks of Large Language Models in Social Media Bot Detection
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "we explore the possibility of LLM-guided manipulation of user textual and structured information to evade detection" and "LLM-guided manipulation strategies could significantly bring down the performance of existing bot detectors by up to 29.6% and harm the calibration and reliability of bot detection systems."

Progress saved at index: 498


Processing papers:  98%|███████████████████████████████████████████████████████████▌ | 499/511 [24:10<00:30,  2.58s/it]

Title: Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM Collaboration
Evaluation Result: Title: Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM Collaboration

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "knowledge gaps -- missing or outdated information in LLMs -- might always persist given the evolving nature of knowledge" and "Motivated by their failures in self-reflection and over-reliance on held-out sets".

Progress saved at index: 499


Processing papers:  98%|███████████████████████████████████████████████████████████▋ | 500/511 [24:12<00:29,  2.69s/it]

Title: Safety of Multimodal Large Language Models on Images and Text
Evaluation Result: Title: Safety of Multimodal Large Language Models on Images and Text
Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the vulnerabilities of MLLMs to unsafe instructions bring huge safety risks when these models are deployed in real-world scenarios."

Progress saved at index: 500


Processing papers:  98%|███████████████████████████████████████████████████████████▊ | 501/511 [24:15<00:26,  2.61s/it]

Title: Large Language Models Based Fuzzing Techniques: A Survey
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "Considering that existing fuzzing test techniques are not entirely automated and software vulnerabilities continue to evolve, there is a growing trend towards employing fuzzing test generated based on large language models."

Progress saved at index: 501


Processing papers:  98%|███████████████████████████████████████████████████████████▉ | 502/511 [24:18<00:24,  2.70s/it]

Title: IndiVec: An Exploration of Leveraging Large Language Models for Media Bias Detection with Fine-Grained Bias Indicators
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "we introduce a general bias detection framework, IndiVec, built upon large language models."

Progress saved at index: 502


Processing papers:  98%|████████████████████████████████████████████████████████████ | 503/511 [24:20<00:20,  2.55s/it]

Title: Redefining "Hallucination" in LLMs: Towards a psychology-informed framework for mitigating misinformation
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 5.
Evidence: "a notable challenge surfaces in the form of 'hallucinations.' This phenomenon results in LLMs outputting misinformation in a confident manner, which can lead to devastating consequences with such a large user base."

Progress saved at index: 503


Processing papers:  99%|████████████████████████████████████████████████████████████▏| 504/511 [24:24<00:20,  2.87s/it]

Title: Multimodal Embodied Interactive Agent for Cafe Scene
Evaluation Result: **Title:** Multimodal Embodied Interactive Agent for Cafe Scene  
**Does it talk about LLMs:** Yes.  
**Rate Limitations of LLMs:** 3.  
**Evidence:** "Nevertheless, prior works on embodied intelligence typically encode scene or historical memory in an unimodal manner, either visual or linguistic, which complicates the alignment of the model's action planning with embodied control."

Progress saved at index: 504


Processing papers:  99%|████████████████████████████████████████████████████████████▎| 505/511 [24:27<00:17,  2.88s/it]

Title: PAP-REC: Personalized Automatic Prompt for Recommendation Language Model
Evaluation Result: Title: PAP-REC: Personalized Automatic Prompt for Recommendation Language Model

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "handcrafted prompts require significant expertise and human effort since slightly rewriting prompts may cause massive performance changes."

Progress saved at index: 505


Processing papers:  99%|████████████████████████████████████████████████████████████▍| 506/511 [24:29<00:13,  2.69s/it]

Title: HiQA: A Hierarchical Contextual Augmentation RAG for Massive Documents QA
Evaluation Result: Title: HiQA: A Hierarchical Contextual Augmentation RAG for Massive Documents QA

Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "However, these methods exhibit limited retrieval accuracy when faced with massive indistinguishable documents, presenting notable challenges in their practical application."

Progress saved at index: 506


Processing papers:  99%|████████████████████████████████████████████████████████████▌| 507/511 [24:31<00:10,  2.62s/it]

Title: Does DetectGPT Fully Utilize Perturbation? Bridge Selective Perturbation to Fine-tuned Contrastive Learning Detector would be Better
Evaluation Result: Title: Does DetectGPT Fully Utilize Perturbation? Bridge Selective Perturbation to Fine-tuned Contrastive Learning Detector would be Better
Paper: The burgeoning generative capabilities of large language models (LLMs) have raised growing concerns about abuse, demanding automatic machine-generated text detectors. DetectGPT, a zero-shot metric-based detector, first introduces perturbation and shows great performance improvement. However, in DetectGPT, random perturbation strategy could introduce noise, and logit regression depends on threshold,

Progress saved at index: 507


Processing papers:  99%|████████████████████████████████████████████████████████████▋| 508/511 [24:35<00:09,  3.01s/it]

Title: Computational Experiments Meet Large Language Model Based Agents: A Survey and Perspective
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 4.
Evidence: "the absence of explicit explainability in LLMs significantly hinders their application in the social sciences."

Progress saved at index: 508


Processing papers: 100%|████████████████████████████████████████████████████████████▊| 509/511 [24:37<00:05,  2.79s/it]

Title: Towards scalable robotic intervention of children with Autism Spectrum Disorder using LLMs
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 1.
Evidence: "This communication is meant to teach perspective-taking using text generated using a Large Language Model (LLM) pipeline."

Progress saved at index: 509


Processing papers: 100%|████████████████████████████████████████████████████████████▉| 510/511 [24:40<00:02,  2.60s/it]

Title: Efficient Non-Parametric Uncertainty Quantification for Black-Box Large Language Models and Decision Planning
Evaluation Result: Does it talk about LLMs: Yes.
Rate Limitations of LLMs: 3.
Evidence: "This paper focuses on decision planning with uncertainty estimation to address the hallucination problem in language models."

Progress saved at index: 510


Processing papers: 100%|█████████████████████████████████████████████████████████████| 511/511 [24:42<00:00,  2.90s/it]

Processing complete.





### Calculating the Macro-F1 Score 

#### Simplifying Evaluation Metrics for Large Language Model Ratings
##### Objective
The aim is to simplify the evaluation of ratings for papers discussing large language models (LLMs) by grouping the original five rating classes into three distinct classes. This approach can help simplify the evaluation and make it more robust, especially when the distinctions between certain ratings (like 2 and 3) are not very clear.

Rationale
The original five rating classes are as follows:

##### Rating 1: Papers that do not talk about LLMs or do not mention any limitations of LLMs.
##### Rating 2: Papers that mention one limitation of LLMs very briefly.
##### Rating 3: Papers that mention limitations of LLMs, but not as the main focus. These limitations are discussed superficially or as secondary points.
##### Rating 4: Papers that provide multiple significant limitations of LLMs. The limitations are discussed in detail alongside other topics.
##### Rating 5: Papers where the entire abstract or most sentences focus on the limitations and challenges of LLMs, discussing them in detail with strong wording indicating serious issues.
Given this, we propose the following grouping into three classes:

##### Class 0: Papers that should be rated as 1. These papers do not talk about LLMs or do not mention any limitations of LLMs.
##### Class 1: Papers that should be rated as 2 or 3. These papers mention limitations of LLMs but not in detail and not as the main focus. The limitations are secondary points in the discussion.
##### Class 2: Papers that should be rated as 4 or 5. These papers primarily focus on the limitations of LLMs, discussing them in detail throughout the abstract.
Implementation
To implement this grouping, we convert the ratings into three classes using a predefined mapping and then calculate the macro F1 score for each prompting technique compared to the human-annotated ground truth. This ensures the evaluation metrics are independent and accurately reflect the performance of each technique.

In [3]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os

file_path = os.path.expanduser('~/Desktop/model_vs_ground_truth_comparison.xlsx')
data = pd.read_excel(file_path)
data.columns = data.columns.str.strip()

data['Ground Truth'] = pd.to_numeric(data['Ground Truth'], errors='coerce')
data['Baseline Rate'] = pd.to_numeric(data['Baseline Rate'], errors='coerce')
data['MyPrompt Rate'] = pd.to_numeric(data['MyPrompt Rate'], errors='coerce')
data['MySimplerPrompt'] = pd.to_numeric(data['MySimplerPrompt'], errors='coerce')
data['FewShot'] = pd.to_numeric(data['FewShot'], errors='coerce')
data['FewShotExplanations'] = pd.to_numeric(data['FewShotExplanations'], errors='coerce')

data = data.dropna(subset=['Ground Truth', 'Baseline Rate', 'MyPrompt Rate', 'MySimplerPrompt', 'FewShot', 'FewShotExplanations'])

def convert_to_three_classes(rating):
    if rating == 1:
        return 0
    elif rating in [2, 3]:
        return 1
    elif rating in [4, 5]:
        return 2

data['Ground Truth 3-Class'] = data['Ground Truth'].apply(convert_to_three_classes)
data['Baseline Rate 3-Class'] = data['Baseline Rate'].apply(convert_to_three_classes)
data['MyPrompt Rate 3-Class'] = data['MyPrompt Rate'].apply(convert_to_three_classes)
data['MySimplerPrompt 3-Class'] = data['MySimplerPrompt'].apply(convert_to_three_classes)
data['FewShot 3-Class'] = data['FewShot'].apply(convert_to_three_classes)
data['FewShotExplanations 3-Class'] = data['FewShotExplanations'].apply(convert_to_three_classes)

ground_truth_3_class = data['Ground Truth 3-Class'].astype(int).tolist()

baseline_rate_3_class = data['Baseline Rate 3-Class'].astype(int).tolist()
my_prompt_rate_3_class = data['MyPrompt Rate 3-Class'].astype(int).tolist()
my_simpler_prompt_3_class = data['MySimplerPrompt 3-Class'].astype(int).tolist()
few_shot_3_class = data['FewShot 3-Class'].astype(int).tolist()
few_shot_explanations_3_class = data['FewShotExplanations 3-Class'].astype(int).tolist()

def calculate_metrics(ground_truth, predictions):
    metrics = {
        'accuracy': accuracy_score(ground_truth, predictions),
        'precision': precision_score(ground_truth, predictions, average='macro'),
        'recall': recall_score(ground_truth, predictions, average='macro'),
        'f1': f1_score(ground_truth, predictions, average='macro')
    }
    return metrics

metrics_baseline = calculate_metrics(ground_truth_3_class, baseline_rate_3_class)
metrics_my_prompt = calculate_metrics(ground_truth_3_class, my_prompt_rate_3_class)
metrics_my_simpler_prompt = calculate_metrics(ground_truth_3_class, my_simpler_prompt_3_class)
metrics_few_shot = calculate_metrics(ground_truth_3_class, few_shot_3_class)
metrics_few_shot_explanations = calculate_metrics(ground_truth_3_class, few_shot_explanations_3_class)

def print_metrics(metrics, model_name):
    print(f"{model_name} Metrics:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}\n")

print_metrics(metrics_baseline, "Prompt1: Standard Baseline Evaluation Prompt")
print_metrics(metrics_my_prompt, "Prompt2: Enhanced Instructional Detail Prompt")
print_metrics(metrics_my_simpler_prompt, "Prompt3: Categorical Instruction Grouping Prompt")
print_metrics(metrics_few_shot, "Prompt4: Selective Few-Shot Prompting Technique")
print_metrics(metrics_few_shot_explanations, "Prompt5: Detailed Explanatory Few-Shot Prompting")


Prompt1: Standard Baseline Evaluation Prompt Metrics:
  Accuracy: 0.5629
  Precision: 0.5843
  Recall: 0.6053
  F1 Score: 0.5522

Prompt2: Enhanced Instructional Detail Prompt Metrics:
  Accuracy: 0.6647
  Precision: 0.6468
  Recall: 0.6577
  F1 Score: 0.6453

Prompt3: Categorical Instruction Grouping Prompt Metrics:
  Accuracy: 0.6946
  Precision: 0.6601
  Recall: 0.6767
  F1 Score: 0.6629

Prompt4: Selective Few-Shot Prompting Technique Metrics:
  Accuracy: 0.7545
  Precision: 0.7219
  Recall: 0.6857
  F1 Score: 0.6982

Prompt5: Detailed Explanatory Few-Shot Prompting Metrics:
  Accuracy: 0.7904
  Precision: 0.7452
  Recall: 0.7384
  F1 Score: 0.7335

