# Data and Library Imports

In [1]:
# Environment setup
import os

from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message. Assigning None to os.environ would otherwise raise
# an opaque "TypeError: str expected, not NoneType".
_openai_key = os.getenv("OPENAI_KEY")
if not _openai_key:
    raise RuntimeError(
        "OPENAI_KEY is not set; add it to your .env file or export it before running this notebook."
    )

# this needs to be set before other llamaindex imports and instantiations
os.environ["OPENAI_API_KEY"] = _openai_key

In [2]:
import json
import numpy as np
import pandas as pd

from llama_index.readers.file import PDFReader

from llama_index.core import (
    SimpleDirectoryReader, 
    VectorStoreIndex, 
    Settings, 
    get_response_synthesizer, 
    PromptTemplate
)
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SimilarityPostprocessor

from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
    ExactMatchFilter
)

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

import time
import random

In [3]:
# Global LlamaIndex defaults for the RAG pipeline
_openai_key = os.getenv("OPENAI_KEY")

# Embedding model
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=_openai_key)

# LLM
Settings.llm = OpenAI(model="gpt-4-turbo", temperature=1, api_key=_openai_key, max_tokens=250)


In [4]:
# Jargon annotations (columns: arxiv_id, reader_id, gpt4_jargon_list, human_jargon_list)
_jargon_path = "data/llm_outputs/240525_march_2024_human_llm_jargon_merged.json"
merged_humam_llm_jargon = pd.read_json(_jargon_path, orient="index")

merged_humam_llm_jargon.head()

Unnamed: 0,arxiv_id,reader_id,gpt4_jargon_list,human_jargon_list
0,2403.16190v1,rid0,"reject option strategy, formal guarantees, min...","correctness,minimality,Anchors"
1,2403.16190v1,rid1,"linear classification problems, reject option ...","reject option strategy,Anchors,heuristic algor..."
2,2307.05300v4,rid0,"cognitive synergist, fine-grained personas, fa...","multi-turn,persona"
3,2307.05300v4,rid1,"cognitive synergy, cognitive synergist, multi-...","Solo Performance Prompting,multi-turn,Chain-of..."
4,2403.16750v1,rid0,"Common Weakness Enumerations (CWEs), SystemVer...","common weakeness enumerations,SystemVerilog,Re..."


In [5]:
# Read in the arXiv metadata.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(
    "data/arxiv_metadata/filtered/march_2024_ai_hc_cy_peer_reviewed_sampled.json",
    encoding="utf-8",
) as json_data:
    metadata = json.load(json_data)

# Convert JSON to DataFrame: each top-level key of the JSON object becomes one row
metadata_df = pd.DataFrame.from_dict(metadata, orient='index')

metadata_df.head()

Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref,peer_reviewed
0,2403.16190v1,http://arxiv.org/abs/2403.16190v1,Logic-based Explanations for Linear Support Ve...,Support Vector Classifier (SVC) is a well-know...,1711293284000,1711293284000,"[Francisco Mateus Rocha Filho, Thiago Alves Ro...","16 pages, submitted to BRACIS 2023 (Brazilian ...","[cs.AI, cs.LG, cs.LO, I.2.4; I.2.6]",cs.AI,10.1007/978-3-031-45368-7_10,,True
1,2307.05300v4,http://arxiv.org/abs/2307.05300v4,Unleashing the Emergent Cognitive Synergy in L...,Human intelligence thrives on cognitive synerg...,1711463553000,1689086719000,"[Zhenhailong Wang, Shaoguang Mao, Wenshan Wu, ...",Accepted as a main conference paper at NAACL 2024,"[cs.AI, cs.CL]",cs.AI,,,True
2,2403.16750v1,http://arxiv.org/abs/2403.16750v1,"All Artificial, Less Intelligence: GenAI throu...",Modern hardware designs have grown increasingl...,1711373004000,1711373004000,"[Deepak Narayan Gadde, Aman Kumar, Thomas Nala...",Published in DVCon U.S. 2024,[cs.AI],cs.AI,,,True
3,2311.10112v2,http://arxiv.org/abs/2311.10112v2,zrLLM: Zero-Shot Relational Learning on Tempor...,Modeling evolving knowledge over temporal know...,1710517087000,1700083515000,"[Zifeng Ding, Heling Cai, Jingpei Wu, Yunpu Ma...",Accepted to NAACL 2024 main conference,"[cs.AI, cs.CL, cs.LG]",cs.AI,,,True
4,2310.08992v3,http://arxiv.org/abs/2310.08992v3,CodeChain: Towards Modular Code Generation Thr...,Large Language Models (LLMs) have already beco...,1710386949000,1697192268000,"[Hung Le, Hailin Chen, Amrita Saha, Akash Goku...",Accepted to ICLR 2024,"[cs.AI, cs.CL, cs.PL]",cs.AI,,,True


In [6]:
# Light text cleaning so that it can be properly fed into anything.
# Collapse every run of whitespace (newlines, carriage returns, tabs, repeated spaces) into
# a single space. A single literal '  ' -> ' ' pass only halves each run, so three or more
# consecutive whitespace characters would still leave a double space behind.
for text_column in ('summary', 'title'):
    metadata_df[text_column] = (
        metadata_df[text_column]
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# RAG: Loading

In [None]:
# Load the sampled PDFs with `SimpleDirectoryReader`, routing .pdf files through `PDFReader`
parser = PDFReader()
file_extractor = {".pdf": parser}


def filename_fn(filename):
    """Return the per-file metadata dict, which holds only the file name it was given."""
    return {"file_name": filename}


documents = SimpleDirectoryReader(
    "data/arxiv_pdfs/march_2024_ai_hc_cy_peer_reviewed_sampled",
    file_extractor=file_extractor,
    file_metadata=filename_fn,
).load_data()


In [None]:
# LlamaIndex stores each page of a paper as a separate document, so this count is
# much larger than the actual number of papers.
n_documents = len(documents)
print(n_documents)

In [None]:
# Add the arxiv_id to each document's metadata, derived from the PDF's file name.
# The title, abstract and primary category are deliberately NOT attached: only the paper's
# full text should be used to define the jargon terms.
for document in documents:
    # Take the base name and drop the extension, rather than assuming "/" separators
    # and a 4-character suffix.
    pdf_name = os.path.basename(document.metadata["file_name"])
    document.metadata["arxiv_id"] = os.path.splitext(pdf_name)[0]
    

You do NOT want the ID, the title, or the abstract to be embedded in any way. They would either add extraneous information (the ID) or confuse the jargon-term lookup (the title/abstract). We only want to use the full text of the paper to define the jargon terms.

In [None]:
# Hide the arxiv_id from the embedding model and from the LLM (during response synthesis).
print("Hidden metadata: ", documents[0].excluded_embed_metadata_keys)

for document in documents:
    # Only append when missing, so that re-running this cell does not pile up duplicates.
    for excluded_keys in (
        document.excluded_llm_metadata_keys,
        document.excluded_embed_metadata_keys,
    ):
        if "arxiv_id" not in excluded_keys:
            excluded_keys.append("arxiv_id")
print("Hidden metadata -- updated: ", documents[0].excluded_embed_metadata_keys)

In [None]:
# A preview of what the LLM and the embedding models see
from llama_index.core.schema import MetadataMode

first_document = documents[0]
llm_view = first_document.get_content(metadata_mode=MetadataMode.LLM)
embed_view = first_document.get_content(metadata_mode=MetadataMode.EMBED)

print("The LLM sees this: \n", llm_view)
print("\n\n\nThe Embedding model sees this: \n", embed_view)

# RAG: Transformation/Indexing

Chunking + Embedding Chunks

In [None]:
# # Setup client
# client = OpenAI(
#     api_key = os.getenv("OPENAI_KEY"),
# )

In [None]:
%%time

# Create an index
index = VectorStoreIndex.from_documents(
    documents, 
    transformations=[
        SentenceSplitter(chunk_size=256, chunk_overlap=20), # I want to chunk at the sentence level, since term definitions are usually in a single sentence
        OpenAIEmbedding(
            model="text-embedding-3-small", 
            api_key=os.getenv("OPENAI_KEY")
        ) # Use the OpenAI embedding model
    ]
)

In [None]:
%%time

# Read more here about load this back in: https://docs.llamaindex.ai/en/stable/understanding/storing/storing/
index.storage_context.persist(persist_dir="data/vector_indices")

# Load the Index in 

In [8]:
%%time

from llama_index.core import StorageContext, load_index_from_storage

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="data/vector_indices")

# load index
index_loaded = load_index_from_storage(
    storage_context, 
    transformations=[
        SentenceSplitter(chunk_size=256, chunk_overlap=20), # I want to chunk at the sentence level, since term definitions are usually in a single sentence
        OpenAIEmbedding(
            model="text-embedding-3-small", 
            api_key=os.getenv("OPENAI_KEY")
        ) # Use the OpenAI embedding model
    ]
)

CPU times: user 30.9 s, sys: 141 ms, total: 31.1 s
Wall time: 31.2 s


# Querying -- Trials

In [None]:
# Basic engine for testing: default settings, no per-paper filter
query_engine = index_loaded.as_query_engine()

trial_query = "define: reject option strategy"
response = query_engine.query(trial_query)
print(response)


In [18]:
%%time

# Create a more complex engine for testing

# Create a filter on the arxiv_id
filters = MetadataFilters(filters=[
    ExactMatchFilter(
        key="arxiv_id", 
        value="2307.05300v4"
    )
])

# Configure retriever
retriever = VectorIndexRetriever(
    index=index_loaded,
    similarity_top_k=10,
    filters=filters
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="compact",
)

# assemble query engine
custom_query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.3)]
)


CPU times: user 748 µs, sys: 1.63 ms, total: 2.38 ms
Wall time: 2.44 ms


In [19]:
# Look up the question-answering prompt the synthesizer currently uses
synthesizer_prompts = response_synthesizer.get_prompts()
synthesizer_prompts['text_qa_template']

SelectorPromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x1332f5820>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content="You are an expert

In [None]:
%%time

response = custom_query_engine.query(
    "Use 1-2 sentences to explain this term so that even a reader without deep scientific and technical knowledge can understand it easily: cognitive synergist."
)
print(response)


In [None]:
%%time
from openai import OpenAI as oai_real

# Setup client
client = oai_real(
    api_key = os.getenv("OPENAI_KEY"),
)

abstract = "Human intelligence thrives on cognitive synergy, where collaboration among different minds yield superior outcomes compared to isolated individuals. In this work, we propose Solo Performance Prompting (SPP), which transforms a single LLM into a cognitive synergist by engaging in multi-turn self-collaboration with multiple personas. A cognitive synergist is an intelligent agent that collaboratively combines multiple minds' strengths and knowledge to enhance problem-solving in complex tasks. By dynamically identifying and simulating different personas based on task inputs, SPP unleashes the potential of cognitive synergy in LLMs. Our in-depth analysis shows that assigning multiple fine-grained personas in LLMs improves problem-solving abilities compared to using a single or fixed number of personas. We evaluate SPP on three challenging tasks: Trivia Creative Writing, Codenames Collaborative, and Logic Grid Puzzle, encompassing both knowledge-intensive and reasoning-intensive types. Unlike previous works, such as Chain-of-Thought, that solely enhance the reasoning abilities in LLMs, experimental results demonstrate that SPP effectively reduces factual hallucination, and maintains strong reasoning capabilities. Additionally, comparative experiments show that cognitive synergy only emerges in GPT-4 and does not appear in less capable models, such as GPT-3.5-turbo and Llama2-13b-chat, which draws an interesting analogy to human development. Code, data, and prompts can be found at: this https URL. "
term = "cognitive synergist"
term_definition = client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": f"Here's a scientific abstract: {abstract}. Based on this information, please use 1-2 sentences to explain this term so that even a reader without deep scientific and technical knowledge can understand it easily: {term}. \n Definition: "}
                    ],
                    temperature=1, 
                    seed=10,
                )
term_definition = term_definition.choices[0].message.content
print(term_definition)

# Querying -- Actual

In [None]:
# Create a custom prompt template -- just to maximise control over everything for now
# template = (
#     "Here are some relevant excerpts from a scientific paper:"
#     "---------------------\n"
#     "{context_str}"
#     "\n---------------------\n"
#     "Based on this information, please {query_str}\n"
#     "Definition: "
# )
# qa_template = PromptTemplate(template)

# Here's the default template:
# Source: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/prompts/default_prompts.py

# DEFAULT_TEXT_QA_PROMPT_TMPL = (
#     "Context information is below.\n"
#     "---------------------\n"
#     "{context_str}\n"
#     "---------------------\n"
#     "Given the context information and not prior knowledge, "
#     "answer the query.\n"
#     "Query: {query_str}\n"
#     "Answer: "
# )

# response_synthesizer = get_response_synthesizer(
#             response_mode="compact",
#             text_qa_template=qa_template
#         )
# # Accessing the new prompt
# response_synthesizer.get_prompts()['text_qa_template']#['template']

I decided to get the baseline prompts as close to the RAG prompt as possible, rather than twiddling too much with the RAG prompt.

In [13]:
%%time

# Metadata and definitions
metadata_jargon_defs_rag_for_pickling = []
metadata_jargon_defs_rag_for_json = []

# Iterate over articles, the reader, the jargon within
for idx, row in merged_humam_llm_jargon.iterrows():

    # Get list of terms
    arxiv_id = row['arxiv_id']
    reader_id = row['reader_id']
    human_jargon_terms = row['human_jargon_list']

    # Sometimes the lists are empty, in that case, no definitions need generation
    if human_jargon_terms:
        # Strip out any spaces, make sure it's a list format + delete any empty strings
        human_jargon_terms = [i.strip() for i in human_jargon_terms.split(',') if i.strip()]
    
        # Create a metadata filter for the query engine based on the arxiv_id
        filters = MetadataFilters(filters=[
            ExactMatchFilter(
                key="arxiv_id", 
                value=arxiv_id
            )
        ])

        # Configure retriever
        retriever = VectorIndexRetriever(
            index=index_loaded,
            similarity_top_k=10,
            filters=filters
        )
        
        # configure response synthesizer + remember to initialise with the custom prompt
        response_synthesizer = get_response_synthesizer(
            response_mode="compact",
            # text_qa_template=qa_template
        )

        # assemble query engine
        custom_query_engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
            node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.3)]
        )
        
        # iterate over the jargon terms and call
        for term in human_jargon_terms:
                # call for definition
                term_definition = custom_query_engine.query(
                    f"Please use 1-2 sentences to explain the following term so that even a reader without deep scientific and technical knowledge can understand it easily: {term}.")
    
                # Add it in
                metadata_jargon_defs_rag_for_pickling.append({
                    'arxiv_id': arxiv_id, 
                    'reader_id': reader_id, 
                    'human_jargon_term': term, 
                    'definition_object': term_definition, 
                    'definition_text': term_definition.response
                })
    
                metadata_jargon_defs_rag_for_json.append({
                    'arxiv_id': arxiv_id, 
                    'reader_id': reader_id, 
                    'human_jargon_term': term, 
                    'definition_text': term_definition.response
                })

                # print(arxiv_id, reader_id, term_definition.response)
    
                time.sleep(random.uniform(0, 1))



CPU times: user 21.6 s, sys: 891 ms, total: 22.5 s
Wall time: 26min 13s


In [15]:
# metadata_jargon_defs_rag_for_json

In [16]:
# Save to JSON and pickle respectively
import pickle
from datetime import datetime

# Today's date as YYMMDD, used as the filename prefix
today = datetime.today().strftime('%y%m%d')

rag_json_path = f'data/llm_outputs/{today}_march_2024_sampled_jargon_definitions_rag.json'
rag_pickle_path = f'data/llm_outputs/{today}_march_2024_sampled_jargon_definitions_objects_rag.pkl'

# Plain-text definitions
with open(rag_json_path, 'w') as file:
    json.dump(metadata_jargon_defs_rag_for_json, file, indent=4)

# Records that also carry the full response objects
with open(rag_pickle_path, 'wb') as file:
    pickle.dump(metadata_jargon_defs_rag_for_pickling, file)


In [17]:
from openai import OpenAI as oai_real

# OpenAI SDK client used by the baseline cells
client = oai_real(api_key=os.getenv("OPENAI_KEY"))

In [22]:
%%time

# Get baselines for GPT-4
metadata_jargon_defs_abstract = []

# Merge w/ abstracts to add those as metadata
merged_humam_llm_jargon_abstracts = pd.merge(merged_humam_llm_jargon, metadata_df[['arxiv_id', 'summary']], on='arxiv_id', how='inner')

# Iterate over articles, the reader, the jargon within
for idx, row in merged_humam_llm_jargon_abstracts.iterrows():

    # Get list of terms
    arxiv_id = row['arxiv_id']
    reader_id = row['reader_id']
    abstract = row['summary']
    human_jargon_terms = row['human_jargon_list']

    # Sometimes the lists are empty, in that case, no definitions need generation
    if human_jargon_terms:

        # Strip out any spaces, make sure it's a list format
        human_jargon_terms = [i.strip() for i in human_jargon_terms.split(',')]

        # iterate over the jargon terms and call
        for term in human_jargon_terms:

            if len(term)>2:

                # call for definition
                term_definition = client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=[
                        {"role": "system", "content": "You are an expert Q&A system that is trusted around the world.\nAlways answer the query using the provided context information, and not prior knowledge.\nSome rules to follow:\n1. Never directly reference the given context in your answer.\n2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines."},
                        {"role": "user", "content": f"Here's a scientific abstract: {abstract}. Given the context information and not prior knowledge, please use 1-2 sentences to explain this term so that even a reader without deep scientific and technical knowledge can understand it easily: {term}. \n Definition: "}
                    ],
                    temperature=1, 
                    seed=10,
                )
                term_definition = term_definition.choices[0].message.content
                
                metadata_jargon_defs_abstract.append({
                    'arxiv_id': arxiv_id, 
                    'reader_id': reader_id, 
                    'human_jargon_term': term, 
                    'definition_text': term_definition
                })

                # print(arxiv_id, reader_id, term_definition)
    
                time.sleep(random.uniform(0, 1))
        

CPU times: user 5.79 s, sys: 153 ms, total: 5.94 s
Wall time: 24min 20s


In [25]:
# metadata_jargon_defs_abstract

In [24]:
# Write the abstract-based baseline definitions to a date-prefixed JSON file
abstract_json_path = f'data/llm_outputs/{today}_march_2024_sampled_jargon_definitions_abstract.json'
with open(abstract_json_path, 'w') as file:
    json.dump(metadata_jargon_defs_abstract, file, indent=4)

In [26]:
%%time

# Get baselines for GPT-4 -- no context case
metadata_jargon_defs_nocontext = []

# Iterate over articles, the reader, the jargon within
for idx, row in merged_humam_llm_jargon.iterrows():

    # Get list of terms
    arxiv_id = row['arxiv_id']
    reader_id = row['reader_id']
    human_jargon_terms = row['human_jargon_list']

    # Sometimes the lists are empty, in that case, no definitions need generation
    if human_jargon_terms:

        # Strip out any spaces, make sure it's a list format
        human_jargon_terms = [i.strip() for i in human_jargon_terms.split(',')]

        # iterate over the jargon terms and call
        for term in human_jargon_terms:

            if len(term)>2:

                # call for definition
                term_definition = client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": f"Please use 1-2 sentences to explain this term so that even a reader without deep scientific and technical knowledge can understand it easily: {term}. \n Definition: "}
                    ],
                    temperature=1, 
                    seed=10,
                )
                term_definition = term_definition.choices[0].message.content
                
                metadata_jargon_defs_nocontext.append({
                    'arxiv_id': arxiv_id, 
                    'reader_id': reader_id, 
                    'human_jargon_term': term, 
                    'definition_text': term_definition
                })
    
                time.sleep(random.uniform(0, 1))
        

CPU times: user 5.48 s, sys: 155 ms, total: 5.63 s
Wall time: 22min 37s


In [29]:
# metadata_jargon_defs_nocontext

In [28]:
# Write the no-context baseline definitions to a date-prefixed JSON file
nocontext_json_path = f'data/llm_outputs/{today}_march_2024_sampled_jargon_definitions_nocontext.json'
with open(nocontext_json_path, 'w') as file:
    json.dump(metadata_jargon_defs_nocontext, file, indent=4)