In [4]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient
import json
from citegeist.utils.helpers import (
    load_api_key,
    generate_summary_prompt,
    generate_related_work_prompt,
)
from citegeist.utils.azure_client import AzureClient
from citegeist.utils.citations import get_arxiv_abstract, get_arxiv_citation
from dotenv import load_dotenv
import os

load_dotenv()

topic_model = BERTopic.load("MaartenGr/BERTopic_ArXiv")
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
client = MilvusClient("./database.db")
prompting_client = AzureClient(
    endpoint=os.getenv("AZURE_ENDPOINT"),
    deployment_id=os.getenv("AZURE_PROMPTING_MODEL"),
    api_key=load_api_key(os.getenv("KEY_LOCATION")),
)

In [5]:
abstract = "Large Language Models have shown impressive per- formance across a wide array of tasks involving both structured and unstructured textual data. More recently, adaptions of these models have drawn attention to their abilities to work with code across different programming languages. On this notion, different benchmarks for code generation, repair, or completion suggest that certain models have programming abilities comparable to or even surpass humans. In this work, we demonstrate that the performance on this benchmark does not translate to the innate ability of humans to appreciate the structural control flow of code. For this purpose, we extract code solutions from the Hu- manEval benchmark, which the relevant models perform very strongly on, and trace their execution path using function calls sampled from the respective test set. Using this dataset, we investigate the ability of 5 state-of-the-art LLMs to match the execution trace and find that, despite the model’s abilities to generate semantically identical code, they possess only limited ability to trace the execution path, especially for traces with increased length. We find that even the top-performing model, Gemini 1.5 Pro can only fully correctly generate the trace of 47% of HumanEval tasks. In addition, we introduce a specific subset for three key structures not, or only contained to a limited extent in Hu- manEval: Recursion, Parallel Processing, and Object Oriented Programming principles, including concepts like Inheritance and Polymorphism. Besides OOP, we show that none of the investigated models achieve an average accuracy of over 5% on the relevant traces. Aggregating these specialized parts with the ubiquitous HumanEval tasks, we present the Benchmark CoCoNUT: Code Control Flow for Navigation Understanding and Testing, which measures a models ability to trace the execu- tion of code upon relevant calls, including advanced structural components. We conclude that the current generation LLMs still need to significantly improve to enhance their code reasoning abilities. We hope our dataset can help researchers bridge this gap in the near future."
embedded_abstract = embedding_model.encode(abstract)
topic = topic_model.transform(abstract)
topic_id = topic[0][0]

res = client.search(
    collection_name="abstracts",
    data=[embedded_abstract],
    limit=10,
    # filter = f'topic == {topic_id}',
    search_params={"metric_type": "COSINE", "params": {}},
    # output_fields = []
)
formatted_res = json.dumps(res, indent=4)
print(formatted_res)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-28 22:36:04,542 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


[
    [
        {
            "id": "2408.10718",
            "distance": 0.8272675275802612,
            "entity": {}
        },
        {
            "id": "2408.13001",
            "distance": 0.8230882883071899,
            "entity": {}
        },
        {
            "id": "2410.21647",
            "distance": 0.812175989151001,
            "entity": {}
        },
        {
            "id": "2309.15432",
            "distance": 0.8043964505195618,
            "entity": {}
        },
        {
            "id": "2407.11470",
            "distance": 0.803741455078125,
            "entity": {}
        },
        {
            "id": "2407.19055",
            "distance": 0.8031219840049744,
            "entity": {}
        },
        {
            "id": "2311.08588",
            "distance": 0.8028398752212524,
            "entity": {}
        },
        {
            "id": "2402.08699",
            "distance": 0.8003471493721008,
            "entity": {}
        },
        {
        

In [7]:
# we need to remove the best match because that's the same input paper (this only has to be done for papers that are already in the arxiv corpus)
# res = res[0][1:]

res = res[0]

In [8]:
for obj in res:
    arxiv_id = obj["id"]
    arxiv_abstract = get_arxiv_abstract(arxiv_id)
    response: str = prompting_client.get_completions(
        generate_summary_prompt(abstract, arxiv_abstract),
        os.getenv("AZURE_PROMPTING_MODEL_VERSION"),
    )
    obj["summary"] = response
    obj["citation"] = get_arxiv_citation(arxiv_id)

In [10]:
print(generate_related_work_prompt(abstract, res))


    I am working on a research paper, and I need a well-written "Related Work" section. Below I'm providing you with the abstract of the paper I'm writing and a list of summaries of related works I've identified.
    
    Here's the abstract of my paper:
    "Large Language Models have shown impressive per- formance across a wide array of tasks involving both structured and unstructured textual data. More recently, adaptions of these models have drawn attention to their abilities to work with code across different programming languages. On this notion, different benchmarks for code generation, repair, or completion suggest that certain models have programming abilities comparable to or even surpass humans. In this work, we demonstrate that the performance on this benchmark does not translate to the innate ability of humans to appreciate the structural control flow of code. For this purpose, we extract code solutions from the Hu- manEval benchmark, which the relevant models perform ver

In [11]:
response: str = prompting_client.get_completions(
    generate_related_work_prompt(abstract, res),
    os.getenv("AZURE_PROMPTING_MODEL_VERSION"),
)
print(response)

In recent years, the evaluation of Large Language Models (LLMs) in code-related tasks has garnered significant attention, with various benchmarks being developed to assess different aspects of code understanding and generation. A notable contribution in this domain is CodeJudge-Eval, which evaluates LLMs' ability to judge the correctness of code solutions rather than merely generating code (Zhao et al., 2024). This approach complements our research by highlighting the limitations of current LLMs in tracing code execution paths, underscoring the need for improved benchmarks that capture the nuanced understanding of code beyond generation. Similarly, the RACE benchmark evaluates LLMs on multiple dimensions of code quality, such as readability and maintainability, emphasizing the need for comprehensive evaluation metrics to capture the multifaceted requirements of real-world code development (Zheng et al., 2024).

Another critical aspect of LLM evaluation is the consideration of programmi