In [1]:
import nest_asyncio
from r2r import R2RClient, R2RException

# Apply the nest_asyncio patch before the client is created
# (presumably so the client can run inside the notebook's already-running
# event loop -- TODO confirm against the nest_asyncio / r2r docs).
nest_asyncio.apply()

# Client for the R2R server expected on localhost:7272.
# NOTE(review): timeout=600 is presumably in seconds -- confirm with the R2R SDK.
client = R2RClient(base_url="http://localhost:7272", timeout=600)

In [2]:
# Load the evaluation questions: one entry per line of ./questions.txt,
# with leading/trailing whitespace (including the newline) stripped.
# Blank lines are kept as empty strings.
with open("./questions.txt", "r", encoding="utf-8") as questions_file:
    questions = [raw_line.strip() for raw_line in questions_file]

In [3]:
# Retrieval settings handed to `client.retrieval.rag(search_settings=...)`.
search_settings = dict(
    use_semantic_search=True,
    limit=3,
    offset=0,
    include_metadatas=True,
    include_scores=True,
    search_strategy="vanilla",
)

# Generation settings handed to `client.retrieval.rag(rag_generation_config=...)`.
rag_generation_config = dict(
    temperature=0.1,
    top_p=1,
    max_tokens_to_sample=512,
)

In [4]:
# Prompt text passed to the RAG call as `task_prompt_override`.
# It carries two placeholders, `{query}` (appearing both before and after the
# context section) and `{context}`; they are not filled in anywhere in this
# notebook, so presumably the R2R server substitutes them -- TODO confirm.
# The instructions ask for short answers and for no answer at all when the
# context does not cover the query.
# NOTE(review): the bullet ending in "focusing on the relevant" looks
# truncated (perhaps "relevant information"). Left untouched here because any
# edit to this string changes what the model is asked.
template = """
## Task:
 
Answer the query given below using the provided context. Keep your answer very short and concise!
     
 - Aim to answer in 1-2 sentences whenever possible
 - If a longer answer is needed, make it as concise as possible focusing on the relevant
 - For step-by-step guides, use numbered steps with each step on a new line
 - If there're multiple answers, use numbered steps with each step on a new line
 - DO NOT use line item references to the context
 - If there is no context available locally to answer, inform the user of insufficient information
 - NEVER provide an answer if there's no context that discusses it
 - NEVER reason about a possible answer! If no context can answer the query there should be NO answer
 
 ### Query:
 
 {query}
 
 
 ### Context:
 
 {context}
 
 
 ### Query:
 
 {query}
 
 
 # Reminder: Provide short and concise answers and NEVER answer something that is not in the provided context!
 
 ## Response:
 """

In [5]:
def summarize_ctx_template(context: str) -> str:
    """Build the prompt that asks an LLM to summarize retrieved context.

    Args:
        context: Text to summarize; inserted verbatim into the prompt.

    Returns:
        The prompt string: a leading newline, the instruction line, the
        context, and the closing guidance, each line indented by four spaces.
    """
    indent = "    "
    prompt_lines = [
        "",
        indent + "Summarize the following context while preserving all key information needed to answer future questions:",
        indent,
        indent + context,
        indent,
        indent + "Provide a concise summary that includes all essential facts, data points, and information.",
        indent + "Try to stay under 4 sentences. Only provide the summary and no further explanation or details.",
        indent + "Don't mention things like: Here is a concise summary of the key information.",
        indent,
    ]
    return "\n".join(prompt_lines)

In [None]:
import re
import ollama
import pandas as pd

# One row per question: the question, the summarized retrieved context, the
# answer generated by the RAG call, and a reference answer (left empty here).
df = pd.DataFrame(columns=["question", "retrieved_context", "generated_response", "reference_answer"])

for i, q in enumerate(questions):
    try:
        # Submit the question to R2R: retrieval plus answer generation.
        resp = client.retrieval.rag(
            query=q,
            search_mode="custom",
            search_settings=search_settings,
            rag_generation_config=rag_generation_config,
            task_prompt_override=template,
            include_title_if_available=True,
        ).results

        # Join the text of all retrieved chunks, collapsing runs of newlines
        # inside each chunk to a single newline.
        full_ctx = "\n".join(
            re.sub(r"\n+", "\n", chunk.text)
            for chunk in resp.search_results.chunk_search_results
        )

        # Condense the retrieved context with a local Ollama model.
        summary_ctx = ollama.generate(
            model="llama3.1",
            prompt=summarize_ctx_template(full_ctx),
            options={
                "temperature": 0.1,
                "num_predict": 512,
            },
        )["response"]

        llm_answer = resp.completion

        # Append the row in memory; the CSV itself is written once, after the loop.
        df.loc[len(df)] = [q, summary_ctx, llm_answer, None]

        print(f"Question {i+1} out of {len(questions)}")
    except R2RException as r2re:
        print(f"Skipping {i+1} because of {str(r2re)}")
    except (ollama.ResponseError, ollama.RequestError) as oe:
        # Several exception types must be given as a tuple. `A | B` is not a
        # valid except target and raises TypeError when the clause is evaluated.
        print(f"Skipping {i+1} because of {str(oe)}")
    except Exception as e:
        # Deliberate best-effort: any other failure skips this question only.
        print(f"Skipping {i+1} because of {str(e)}")

# Finally, save data to disk
df.to_csv("dataset.csv", index=False)


Question 1 out of 51
Question 2 out of 51
Question 3 out of 51
Question 4 out of 51
Question 5 out of 51
Question 6 out of 51
Question 7 out of 51
Question 8 out of 51
Question 9 out of 51
Question 10 out of 51
Question 11 out of 51
Question 12 out of 51
Question 13 out of 51
Question 14 out of 51
Question 15 out of 51
Question 16 out of 51
Question 17 out of 51
Question 18 out of 51
Question 19 out of 51
Question 20 out of 51
Question 21 out of 51
Question 22 out of 51
Question 23 out of 51
Question 24 out of 51
Question 25 out of 51
Question 26 out of 51
Question 27 out of 51
Question 28 out of 51
Question 29 out of 51
Question 30 out of 51
Question 31 out of 51
Question 32 out of 51
Question 33 out of 51
Question 34 out of 51
Question 35 out of 51
Question 36 out of 51
Question 37 out of 51
Question 38 out of 51
Question 39 out of 51
Question 40 out of 51
Question 41 out of 51
Question 42 out of 51
Question 43 out of 51
Question 44 out of 51
Question 45 out of 51
Question 46 out of 