In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient
import json
from citegeist.utils import load_api_key
from citegeist.utils import (
    generate_summary_prompt_with_page_content,
    generate_related_work_prompt
)
from citegeist.utils import AzureClient
from citegeist.utils import (
    get_arxiv_abstract,
    get_arxiv_citation,
    process_arxiv_paper_with_embeddings,
    find_most_relevant_pages,
)
from citegeist.utils import (
    extract_most_relevant_pages,
    select_diverse_papers_with_weighted_similarity,
)
from dotenv import load_dotenv
import os

# Read settings from a .env file into the process environment before they are looked up below.
load_dotenv()

# Topic model and sentence encoder, both loaded by their published identifiers.
topic_model = BERTopic.load("MaartenGr/BERTopic_ArXiv")
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Milvus client opened on a database file in the working directory.
client = MilvusClient("./database.db")

# Client used for every LLM prompt in this notebook; endpoint, deployment and the
# location of the API key are all taken from environment variables.
prompting_client = AzureClient(
    api_key=load_api_key(os.environ.get("KEY_LOCATION")),
    deployment_id=os.environ.get("AZURE_PROMPTING_MODEL"),
    endpoint=os.environ.get("AZURE_ENDPOINT"),
)

In [None]:
# Abstract of the paper for which a "Related Work" section is generated.
# The wording is kept exactly as extracted (including line-break hyphenation such as
# "per- formance"). The paragraph break before the last two sentences is spelled as an
# explicit "\n": a raw line break inside a single-quoted string literal is a SyntaxError.
abstract = (
    "Large Language Models have shown impressive per- formance across a wide array of tasks involving both structured and unstructured textual data. "
    "More recently, adaptions of these models have drawn attention to their abilities to work with code across different programming languages. "
    "On this notion, different benchmarks for code generation, repair, or completion suggest that certain models have programming abilities comparable to or even surpass humans. "
    "In this work, we demonstrate that the performance on this benchmark does not translate to the innate ability of humans to appreciate the structural control flow of code. "
    "For this purpose, we extract code solutions from the Hu- manEval benchmark, which the relevant models perform very strongly on, and trace their execution path using function calls sampled from the respective test set. "
    "Using this dataset, we investigate the ability of 5 state-of-the-art LLMs to match the execution trace and find that, despite the model’s abilities to generate semantically identical code, they possess only limited ability to trace the execution path, especially for traces with increased length. "
    "We find that even the top-performing model, Gemini 1.5 Pro can only fully correctly generate the trace of 47% of HumanEval tasks. "
    "In addition, we introduce a specific subset for three key structures not, or only contained to a limited extent in Hu- manEval: Recursion, Parallel Processing, and Object Oriented Programming principles, including concepts like Inheritance and Polymorphism. "
    "Besides OOP, we show that none of the investigated models achieve an average accuracy of over 5% on the relevant traces. "
    "Aggregating these specialized parts with the ubiquitous HumanEval tasks, we present the Benchmark CoCoNUT: Code Control Flow for Navigation Understanding and Testing, which measures a models ability to trace the execu- tion of code upon relevant calls, including advanced structural components. \n"
    "We conclude that the current generation LLMs still need to significantly improve to enhance their code reasoning abilities. "
    "We hope our dataset can help researchers bridge this gap in the near future."
)
# Encode the query abstract and run it through the topic model.
embedded_abstract = embedding_model.encode(abstract)
topic = topic_model.transform(abstract)
topic_id = topic[0][0]  # only needed by the topic filter, which is currently switched off

# Cosine-similarity search for the 60 nearest entries of the "abstracts" collection.
# Restricting hits to the query's topic is disabled; it would be:
#     filter=f'topic == {topic_id}'
res = client.search(
    collection_name="abstracts",
    anns_field="embedding",
    data=[embedded_abstract],
    search_params={"metric_type": "COSINE", "params": {}},
    output_fields=["embedding"],
    limit=60,
)

# Dump the raw result, then the number of hits returned for the single query vector.
formatted_res = json.dumps(res, indent=4)
print(formatted_res)
print(len(res[0]))

In [3]:
# When the input paper is itself part of the arXiv corpus, the best match is that same
# paper and has to be removed. The variant that drops it is kept here for reference:
# res = res[0][1:]

# Unwrap the hits belonging to the single query vector that was searched.
# NOTE(review): this rebinds `res`, so running the cell twice without re-running the
# search cell indexes into the hit list again instead of the search result.
res = res[0]

In [4]:
# Keep the first 10 hits of `res` under a separate name.
# NOTE(review): a slice copies only the container; assuming `res` is a list of dicts, the
# hit objects are shared, so later in-place edits to entries of `res` show up here too.
reference = res[:10]

In [8]:
# Move each hit's embedding from the nested "entity" record to the top level of the hit
# and blank out "entity" afterwards.
# The cell marks processed hits by setting "entity" to "", so that marker is used to skip
# hits handled by an earlier run; without the guard a re-run fails on ""["embedding"].
for obj in res:
    entity = obj["entity"]
    if entity == "":
        continue  # already flattened by a previous execution of this cell
    obj["embedding"] = entity["embedding"]
    obj["entity"] = ""

In [10]:
# Pick the papers to process further from the candidate hits in `res`.
# NOTE(review): 10 and 0.35 are presumably the number of papers to select and a
# similarity/diversity weighting — confirm against the helper's signature.
output = select_diverse_papers_with_weighted_similarity(res, 10, 0.35)

In [None]:
# IDs of the 10 first hits in the plain search order.
original_top_10 = [obj["id"] for obj in res[:10]]
print(original_top_10)

# IDs that the diversity-aware selection picked but that are not among those first 10.
# Entries of `output` are indexed with ["id"] elsewhere in this notebook, so the comparison
# has to be done on their IDs: building a set from the entries themselves fails for dicts
# (unhashable) and would compare hit objects against ID values in any case.
differences = {obj["id"] for obj in output} - set(original_top_10)
print(differences)

In [11]:
# Run every selected paper through process_arxiv_paper_with_embeddings and keep the
# non-empty results, in selection order.
paper_embeddings = []
for paper in output:
    arxiv_id = paper["id"]
    print(f"Processing paper: {arxiv_id}")

    result = process_arxiv_paper_with_embeddings(arxiv_id, topic_model)
    if not result:
        # A falsy result is reported and the paper is left out.
        print(f"Paper {arxiv_id}: No content remains after filtering.")
        continue

    paper_embeddings.append(result)
    print(f"Paper {arxiv_id}: Processed successfully.")

# Sanity check: show text and embedding of the first page of the first processed paper.
if paper_embeddings:
    print("First paper, first page text:", paper_embeddings[0][0]["text"])
    print("First paper, first page embedding:", paper_embeddings[0][0]["embedding"])

# NOTE(review): the trailing 60 is passed straight to the helper; its meaning is not
# visible here — confirm against extract_most_relevant_pages.
relevant_pages = extract_most_relevant_pages(paper_embeddings, abstract, topic_model, 60)

Processing paper: 2408.10718
PDF downloaded successfully: 2408.10718.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2408.10718: Processed successfully.
Processing paper: 2309.15432
PDF downloaded successfully: 2309.15432.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2309.15432: Processed successfully.
Processing paper: 2406.15877
PDF downloaded successfully: 2406.15877.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2406.15877: Processed successfully.
Processing paper: 2403.19114
PDF downloaded successfully: 2403.19114.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2403.19114: Processed successfully.
Processing paper: 2305.12138
PDF downloaded successfully: 2305.12138.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2305.12138: Processed successfully.
Processing paper: 2309.01940
PDF downloaded successfully: 2309.01940.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2309.01940: Processed successfully.
Processing paper: 2408.13001
PDF downloaded successfully: 2408.13001.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2408.13001: Processed successfully.
Processing paper: 2403.04811
PDF downloaded successfully: 2403.04811.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2403.04811: Processed successfully.
Processing paper: 2407.19055
PDF downloaded successfully: 2407.19055.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2407.19055: Processed successfully.
Processing paper: 2306.09896
PDF downloaded successfully: 2306.09896.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Paper 2306.09896: Processed successfully.
First paper, first page text: CodeJudge-Eval: Can Large Language Models be Good Judges in
Code Understanding?
♢Yuwei Zhao* , ♠Ziyang Luo* , ♡Yuchen Tian , ♠Hongzhan Lin
♣Weixiang Yan , ♢Annan Li , ♠Jing Ma†
♠Hong Kong Baptist University, ♢Beihang University
♡University of Tokyo, ♣Vaneval.AI
{yuweizhao,liannan}@buaa.edu.cn
{cszyluo,majing}@comp.hkbu.edu.hk
Abstract
Recent advancements in large language models
(LLMs) have showcased impressive code gener-
ation capabilities, primarily evaluated through
language-to-code benchmarks. However, these
benchmarks may not fully capture a model’s
code understanding abilities.
We introduce
CodeJudge-Eval (CJ-Eval), a novel bench-
mark designed to assess LLMs’ code under-
standing abilities from the perspective of code
judging rather than code generation. CJ-Eval
challenges models to determine the correctness
of provided code solutions, encompassing var-
ious error types and compilation issues. By
leveraging

In [22]:
# Fetch the arXiv abstract of every selected paper, in the same order as `output`.
abstracts = [get_arxiv_abstract(selected["id"]) for selected in output]

# Combine the per-paper pages with those abstracts.
# NOTE(review): the final argument 10 is presumably a cut-off on what is kept — confirm.
top_relevant_pages = find_most_relevant_pages(relevant_pages, abstracts, 10)

In [25]:
# Summarise each related paper with respect to our abstract and attach its citation.
#
# The keys of top_relevant_pages are used as positions in `output`: the abstracts passed to
# find_most_relevant_pages were built from `output`, so the arXiv ID for the citation must
# come from the same list. Indexing `res` (all search candidates, in similarity order)
# can attach the citation of a different paper than the one being summarised.
# NOTE(review): this assumes every paper in `output` was processed successfully, so that
# positions in paper_embeddings line up with `output` — confirm.
model_version = os.getenv("AZURE_PROMPTING_MODEL_VERSION")  # same value for every request
for key, obj in top_relevant_pages.items():
    arxiv_id = output[key]["id"]
    arxiv_abstract = obj["abstract"]
    text_segments = obj["text"]
    response: str = prompting_client.get_completions(
        generate_summary_prompt_with_page_content(
            abstract, arxiv_abstract, text_segments
        ),
        model_version,
    )
    # Results are stored on the entry itself and read back when the final prompt is built.
    obj["summary"] = response
    obj["citation"] = get_arxiv_citation(arxiv_id)

In [26]:
# Gather the per-paper entries of top_relevant_pages into a list and preview the
# related-work prompt that is built from them.
data = [*top_relevant_pages.values()]
print(generate_related_work_prompt(abstract, data))


    I am working on a research paper, and I need a well-written "Related Work" section. Below I'm providing you with the abstract of the paper I'm writing and a list of summaries of related works I've identified.
    
    Here's the abstract of my paper:
    "Large Language Models have shown impressive per- formance across a wide array of tasks involving both structured and unstructured textual data. More recently, adaptions of these models have drawn attention to their abilities to work with code across different programming languages. On this notion, different benchmarks for code generation, repair, or completion suggest that certain models have programming abilities comparable to or even surpass humans. In this work, we demonstrate that the performance on this benchmark does not translate to the innate ability of humans to appreciate the structural control flow of code. For this purpose, we extract code solutions from the Hu- manEval benchmark, which the relevant models perform ver

In [27]:
# Build the related-work prompt, send it to the prompting client and print the reply.
related_work_prompt = generate_related_work_prompt(abstract, data)
response: str = prompting_client.get_completions(
    related_work_prompt, os.getenv("AZURE_PROMPTING_MODEL_VERSION")
)
print(response)

### Related Work

The exploration of large language models (LLMs) in the domain of code understanding and generation has been a focal point of recent research, with various studies highlighting both the potential and limitations of these models. A significant body of work has concentrated on developing benchmarks to evaluate LLMs' code reasoning abilities beyond mere code generation. Zhao et al. (2024) introduced CodeJudge-Eval, a benchmark that assesses LLMs' capacity to judge the correctness of code solutions, emphasizing the gap between code generation and deeper code reasoning abilities. This aligns with our research, which critiques the current benchmarks like HumanEval for not adequately assessing LLMs' understanding of code execution paths and structural control flow. Similarly, Song et al. (2024) with CodeApex, and Zheng et al. (2024) have underscored the need for comprehensive evaluation methods that address the limitations of existing benchmarks, echoing our findings on the n

In [None]:
# Print the citation stored on every entry of top_relevant_pages.
# Relies on the "citation" key written by the summary loop; an entry without it raises KeyError.
print([obj["citation"] for obj in top_relevant_pages.values()])