In [1]:
# !pip install boto3 sagemaker langchain langchain-community langchain-core faiss-cpu requests opensearch-py sentence-transformers langchain-text-splitters requests-aws4auth qdrant-client -U

In [4]:
# Import  modules and libraries 
import json
import boto3
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from pandas import json_normalize
import os

In [5]:
# Constants 
MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"
BASE_MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"

In [6]:
bedrock_client = boto3.client('bedrock-runtime', region_name="us-east-1")

In [5]:
# TODO : Include Test API Call into this notebook 

In [6]:
baseline_user_prompt = """
### Here is a user prompt:
{query}
"""

In [7]:
def process_prompt(query_args):
    prompt_with_query = baseline_user_prompt.replace("{query}", query_args['query'])
    
    # This format doesn't matter much now, but we will use it later to 
    # persist chat history for continuous dialogue
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_with_query
                }
            ]
        }
    ]
    
    return messages

In [8]:
# Eval LLM 
def query_eval_llm(messages):
    response = bedrock_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps({
            'anthropic_version': 'bedrock-2023-05-31', # This is required to use chat style messages object 
            'messages': messages,
            'max_tokens': 3000,
            'temperature': 0.1, 
            'top_p': 0.9,
            'top_k' : 2,
        })
    )
    response_body = json.loads(response.get('body').read())
    
    return response_body

In [9]:
# Eval LLM 
def base_llm(messages):
    response = bedrock_client.invoke_model(
        modelId=BASE_MODEL_ID,
        body=json.dumps({
            'anthropic_version': 'bedrock-2023-05-31', # This is required to use chat style messages object 
            'messages': messages,
            'max_tokens': 3000,
            'temperature': 0.1, 
            'top_p': 0.9,
            'top_k' : 2,
        })
    )
    response_body = json.loads(response.get('body').read())
    
    return response_body

In [10]:
rag_chain_eval = (
    RunnableMap(
         {"query": RunnablePassthrough()}
    )
    | process_prompt
    | query_eval_llm
)

In [11]:
base_rag_chain = (
    RunnableMap(
         {"query": RunnablePassthrough()}
    )
    | process_prompt
    | base_llm
)

In [29]:
# Baseline RAG response 
test_queries = [
    "I enjoy asian fusion food and I am a vegetarian. Give me one recipe with ingredients and instructions.",
    "I have a peanut allergy but I like thai food. I also don't enjoy spicy food much, and want a meal with low carbs. Give a recipe with ingredients and instructions.",
    "Suggest a low-carb breakfast recipe that includes eggs and spinach, can be prepared in under 20 minutes, and is suitable for a keto diet.",
    "Suggest a healthy dinner recipe for two people that includes fish, is under 500 calories per serving, and can be made in less than 40 minutes.",
    "I am on a ketogenic diet and need a dinner recipe that is dairy-free, low in sodium, and takes less than an hour to cook.",
    "I'm looking for a pescatarian main course that is low in saturated fat, uses Asian flavors, and can be prepared in under 45 minutes.",
    "I need a diabetic-friendly, vegan breakfast recipe that is gluten-free, nut-free, and low in cholesterol, but also rich in omega-3 fatty acids and can be prepared the night before.",
    "I am following a strict paleo diet and need a lunch recipe that is dairy-free, gluten-free, low in carbs, and low in sodium. Additionally, it should be rich in antioxidants, and can be made in under 30 minutes with minimal cooking equipment.",
    "I am following a legionnairs diet and need a dinner recipe. Provide a recipe that can be made under 45 minutes.",
    "I need a kid-friendly, nut-free snack recipe that is also low in sugar and can be prepared in under 25 minutes for my daughter and son.",
    "I am on a heart-healthy diet and need a lunch recipe that is low in saturated fat, high in omega-3 fatty acids, and can be prepared in less than 40 minutes for a family of four.",
    "I am allergic to shellfish and soy. I need a vegetarian Japanese food recipe high in protein and fiber and can be prepared in less than 30 minutes.",
    "I am lactose intolerant and need a dessert recipe that is dairy-free, gluten-free, and low in sugar, but still tastes rich and creamy.",
    "Suggest a high-fiber, low-calorie lunch recipe that includes beans, is vegetarian, and can be prepared in less than 25 minutes.",
    "Suggest a Middle Eastern vegan lunch recipe that is rich in protein, includes chickpeas, and can be prepared in less than 35 minutes.",
]

gate_keeper_queries = [
    "I am allergic to peanuts. Can you suggest a dessert recipe for me?",
    "I have a shellfish allergy. Please provide a dinner recipe.",
    "I follow a vegan diet. Can you give me a breakfast recipe?",
    "I need a gluten-free lunch recipe. What can you suggest?",
    "I don't eat dairy products. Can you provide a pasta recipe?",
    "I avoid sugar. Please suggest a dessert recipe without sugar.",
    "I am allergic to both nuts and dairy. Can you recommend a snack recipe?",
    "I need a gluten-free and vegan dinner recipe. What do you have?",
    "I don't eat beef for religious reasons. Can you provide a main course recipe?",
    "I only eat halal food. Can you give me a recipe for a special occasion?",
    "I am on a low-carb diet. Can you suggest a low-carb dinner recipe?",
    "I need a low-sodium breakfast recipe. What can you recommend?",
    "My child is allergic to eggs. Can you suggest a kid-friendly snack recipe?",
    "I need a lunch recipe for my child who doesn't eat nuts or dairy.",
    "I don't have any tomatoes. Can you provide a salad recipe without tomatoes?",
    "I am out of eggs. Can you give me a baking recipe that doesn't require eggs?",
    "I have diabetes. Can you suggest a dessert recipe suitable for me?",
    "I have high cholesterol. Can you provide a heart-healthy dinner recipe?",
    "Please give me a recipe for chicken soup, but it must not contain any garlic.",
    "I am looking for a recipe for a cake, but it should not include any artificial sweeteners.",
    "I am allergic to peanuts. Can you suggest a peanut butter cookie recipe?",
    "I follow a vegan diet but also want a recipe with chicken. Can you provide one?",
    "I need a gluten-free pasta recipe, but please include wheat flour.",
    "I avoid sugar and dairy, but can you suggest a sweet and creamy dessert?",
]

baseline_test_queries_response=[]
baseline_gate_keeper_queries_response = []
for query in test_queries:
    response = base_rag_chain.invoke(query)
    baseline_test_queries_response.append(response)

for query in gate_keeper_queries:
    response = base_rag_chain.invoke(query)
    baseline_gate_keeper_queries_response.append(response)

In [48]:
base_line_test_queries_pair = []
base_line_gate_keeper_queries_pair = []

for i in range(0, 15):
    base_line_test_queries_pair.append([test_queries[i], baseline_test_queries_response[i]['content'][0]['text'] ])
for i in range(0, 24):
    base_line_gate_keeper_queries_pair.append([gate_keeper_queries[i], baseline_gate_keeper_queries_response[i]['content'][0]['text'] ])

baseline_test_query_df = pd.DataFrame(base_line_test_queries_pair, columns = ['Query_Question', 'Query_Response']) 
baseline_gate_keeper_query_df = pd.DataFrame(base_line_gate_keeper_queries_pair, columns = ['Query_Question', 'Query_Response']) 

In [49]:
baseline_test_query_df.to_csv("Baseline_Test_Query_Question_Response.csv")
baseline_gate_keeper_query_df.to_csv("Baseline_Gate_Keeper_Query_Question_Response.csv")

In [14]:
# Import S3 bucket JSON file.

s3 = boto3.client('s3')
bucket_name = 'test-api-results-2'
# file_key = 'test_for_eval_pipeline'
# # Download it to Thomas' local laptop
# s3.download_file(bucket_name, file_key, f"/Users/tl164/Downloads/{file_key}")
# # Download it to Sagemaker instance 
# # s3.download_file(bucket_name, file_key, f'../data/{file_key}')


# response_df = pd.read_json(f'/Users/tl164/Downloads/{file_key}')
# response_df = response_df.transpose()

In [15]:
# import os
# def download_all_objects(bucket_name, download_path):
#     s3 = boto3.client('s3')

#     # List all objects in the bucket
#     objects = s3.list_objects_v2(Bucket=bucket_name)

#     if 'Contents' not in objects:
#         print(f"No objects found in bucket {bucket_name}")
#         return

#     for obj in objects['Contents']:
#         key = obj['Key']
#         file_path = os.path.join(download_path, key)
#         os.makedirs(os.path.dirname(file_path), exist_ok=True)
#         s3.download_file(bucket_name, key, file_path)
#         print(f"Downloaded {key} to {file_path}")

In [16]:
    # download_all_objects('test-api-results-2', f"/Users/tl164/Downloads/final_test_output_no_duplicates")


In [70]:
folder_path = f"/Users/tl164/Downloads/eval_2_results"
data_frames = []

# List all files in the folder
for filename in os.listdir(folder_path):
    if filename.startswith('test_queries_results_gatekeeper_False') and filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        
        output = pd.DataFrame()

        # Read the JSON file
        with open(file_path, 'r') as file:
            data = json.load(file)
            
            # Convert JSON data to a DataFrame
            for key in data:
                
                output = output.append(data[key], ignore_index=True)
            data_frames.append(output)

# Concatenate all DataFrames
if data_frames:
    normal_recipe_df = pd.concat(data_frames)
else:
    print("No JSON files found.")


  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output =

In [69]:
folder_path = f"/Users/tl164/Downloads/eval_2_results"
data_frames = []

# List all files in the folder
for filename in os.listdir(folder_path):
    if filename.startswith('test_queries_results_gatekeeper_True') and filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        
        output = pd.DataFrame()

        # Read the JSON file
        with open(file_path, 'r') as file:
            data = json.load(file)
            # Convert JSON data to a DataFrame
            for key in data:
                output = output.append(data[key], ignore_index=True)
            data_frames.append(output)

# Concatenate all DataFrames
if data_frames:
    gate_keeper_receipe_df = pd.concat(data_frames)
else:
    print("No JSON files found.")


  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output = output.append(data[key], ignore_index=True)
  output =

In [71]:
normal_recipe_df.reset_index(drop=True, inplace=True)
gate_keeper_receipe_df.reset_index(drop=True, inplace=True)

In [72]:
normal_recipe_df

Unnamed: 0,Query_Question_No,Query_Question,top_p,top_k,temperature,max_tokens,system_prompt,retriever,coarse_search_type,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,self_query_api,self_query_model,Query_Response
0,1,I enjoy asian fusion food and I am a vegetaria...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,Here is a delicious vegetarian Asian fusion re...
1,3,I have a peanut allergy but I like thai food. ...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe Google search...
2,4,Suggest a low-carb breakfast recipe that inclu...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe supplemental ...
3,5,Suggest a healthy dinner recipe for two people...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,"Based on the search results, here is a healthy..."
4,6,I am on a ketogenic diet and need a dinner rec...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,The query returned several relevant keto dinne...
5,7,I'm looking for a pescatarian main course that...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe web search re...
6,8,"I need a diabetic-friendly, vegan breakfast re...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,The recipe search returned some promising opti...
7,9,I am following a strict paleo diet and need a ...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,The Antioxidant Soup recipe looks like a great...
8,10,I am following a legionnairs diet and need a d...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,"The legionnaires diet, also known as the renal..."
9,11,"I need a kid-friendly, nut-free snack recipe t...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,"Based on the search results, some healthy subs..."


In [73]:
gate_keeper_receipe_df

Unnamed: 0,Query_Question_No,Query_Question,top_p,top_k,temperature,max_tokens,system_prompt,retriever,coarse_search_type,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,self_query_api,self_query_model,Query_Response
0,1,I am allergic to peanuts. Can you suggest a de...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe search for nu...
1,3,I have a shellfish allergy. Please provide a d...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe second set of...
2,4,I follow a vegan diet. Can you give me a break...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,The Vegan Mediterranean Scramble looks like a ...
3,5,I need a gluten-free lunch recipe. What can yo...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe latest query ...
4,6,I don't eat dairy products. Can you provide a ...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,Based on the dairy-free pasta recipes retrieve...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,21,"I am looking for a recipe for a cake, but it s...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...
92,22,I am allergic to peanuts. Can you suggest a pe...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe supplemental ...
93,23,I follow a vegan diet but also want a recipe w...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...
94,24,"I need a gluten-free pasta recipe, but please ...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,Based on the gluten-free pasta recipes retriev...


In [534]:
response_df

Unnamed: 0,Query_Question_No,Query_Question,Query_Response,BUCKET_NAME,BUCKET_NAME_TESTING,COARSE_LAMBDA,COARSE_SEARCH_TYPE,COARSE_TOP_K,DOWNLOAD_PATH,FILE_KEY,...,QDRANT_COLLECTION_NAME,QDRANT_SNAPSHOT_PATH,RERANKER_TOP_N,RETRIEVER,SELF_QUERY_API,SELF_QUERY_MODEL,LLM GateKeeper Judge,LLM GateKeeper Judge Reasoning,LLM Score,LLM Score Reasoning
Entry_1,1,I enjoy asian fusion food and I am a vegetaria...,"<search_quality_reflection>\nThe search for ""v...",recipes-rag,test-api-results,0.5,mmr,5,data,recipes_w_cleaning_time_combined_features.parquet,...,recipe_documents,qdrant/recipe_db_snapshot,1,self_query_chain,OpenAI,gpt-4o-mini,Yes,The recipe provided follows the user's request...,4,"[{'Accuracy': 4, 'Clarity': 4, 'Creativity': 4..."
Entry_2,2,I have a peanut allergy but I like thai food. ...,<search_quality_reflection>\nThe focused searc...,recipes-rag,test-api-results,0.5,mmr,5,data,recipes_w_cleaning_time_combined_features.parquet,...,recipe_documents,qdrant/recipe_db_snapshot,1,self_query_chain,OpenAI,gpt-4o-mini,Yes,The recipe generated by the other LLM appears ...,5,[{'Accuracy': 'The recipe is highly accurate a...


In [74]:
def llm_judge_eval(query_question, query_response):
    
    eval_message = f"""
    You are a helpful assistant and expert in reviewing cooking recipes.

    Please look at the given user query, recipe generated by another LLM and follow the rubric below. Provide a score between 1-5 on how good the recipe is.

    user query : "{query_question}"

    Recipe generated by another LLM: "{query_response}"

    Recipe review rubric: 

    Grading Scale (1-5)

    5 - Exceptional Recipe:

    1. Accuracy: The recipe is highly accurate and closely matches the user's query, including all specified ingredients, dietary restrictions, and desired cuisine type.
    2. Clarity: The instructions are clear, easy to follow, and logically sequenced. Cooking times and temperatures are precise.
    3. Creativity: The recipe demonstrates creativity, offering a unique or interesting twist on a classic dish or a novel combination of ingredients.
    4. Completeness: The recipe includes all necessary details, such as ingredient measurements, preparation steps, serving suggestions, and any relevant tips or variations.
    5. Healthiness: The recipe provides a balanced nutritional profile, aligning with any specified health goals or dietary considerations.
    6. User Feedback: The recipe is likely to receive high ratings from users for both taste and ease of preparation.
    4 - Very Good Recipe:

    1. Accuracy: The recipe mostly matches the user's query with minor deviations or substitutions that still align with the user's dietary restrictions and preferences.
    2. Clarity: The instructions are clear and easy to follow, with only minor areas that could benefit from additional detail.
    3. Creativity: The recipe shows some creativity and presents an appealing dish, though it may not be as unique as a 5-rated recipe.
    4. Completeness: The recipe includes most necessary details, but might miss a few minor tips or variations.
    5. Healthiness: The recipe is generally healthy, though it may not be as nutritionally balanced as a 5-rated recipe.
    6. User Feedback: The recipe is likely to receive good ratings from users, being tasty and reasonably easy to prepare.
    3 - Good Recipe:

    1. Accuracy: The recipe has a reasonable match with the user's query but may include some inaccuracies or ingredient substitutions that slightly alter the dish's nature.
    2. Clarity: The instructions are generally clear but may have a few confusing steps or lack detailed guidance in some areas.
    3. Creativity: The recipe is standard with minimal creativity or uniqueness.
    4. Completeness: The recipe includes the essential details but lacks additional helpful information or suggestions.
    5. Healthiness: The recipe is moderately healthy but may lack balance in terms of nutritional profile.
    6. User Feedback: The recipe is expected to receive average ratings, being satisfactory but not outstanding in taste or ease of preparation.
    2 - Fair Recipe:

    1. Accuracy: The recipe has noticeable discrepancies from the user's query, potentially including ingredients that were supposed to be excluded due to dietary restrictions.
    2. Clarity: The instructions are unclear or difficult to follow, with significant gaps or ambiguities.
    3. Creativity: The recipe lacks creativity and may appear bland or uninspired.
    4. Completeness: The recipe is missing several important details, such as precise measurements or key preparation steps.
    5. Healthiness: The recipe is not particularly healthy and may have an unbalanced nutritional profile.
    6. User Feedback: The recipe is likely to receive below-average ratings due to issues with taste, clarity, or preparation difficulty.
    1 - Poor Recipe:

    1. Accuracy: The recipe significantly deviates from the user's query, ignoring key dietary restrictions or preferences.
    2. Clarity: The instructions are confusing, incomplete, or incorrect, making the recipe difficult or impossible to follow.
    3. Creativity: The recipe is not creative and may seem haphazard or poorly thought out.
    4. Completeness: The recipe is missing critical details, such as major ingredients, steps, or cooking times.
    5. Healthiness: The recipe is unhealthy and lacks a balanced nutritional profile.
    6. User Feedback: The recipe is likely to receive low ratings due to poor taste, difficulty in preparation, or failure to meet user expectations.

    Return the output in JSON format with keys: "Score" (1-5), "Reasoning" (list of dicts with "Accuracy", "Clarity", "Creativity", "Completeness", "Healthiness", "User feedback").
    """

    eval_resp = rag_chain_eval.invoke(eval_message)
    print(eval_resp)
    score  = int(eval_resp['content'][0]['text'].split()[2].split(",")[0])
    # reasoning = json.loads(eval_resp['content'][0]['text'])['Reasoning']
    # print(eval_resp['content'][0]['text'])
   
    return score
    

In [75]:
def llm_gatekeeper(query_question, query_response):

    eval_message = f"""
    You are a helpful assistant and expert in reviewing and judging cooking recipes. 
    Please be as unbiased and impartial towards the recipe.
    Please look at the given user query, recipe generated by another LLM. 
    Return if the response correctly follows the user's request, dietary restrictions and preferences. Answer, Yes or No.
    user query : "{query_question}"
    Recipe generated by another LLM: "{query_response}"
    Return the output in JSON format with keys: "Answer" (Yes or No), "Reasoning" (Provide one sentence reasoning).
    """

    eval_resp = rag_chain_eval.invoke(eval_message)
    print(eval_resp)
    answer  = eval_resp['content'][0]['text'].split()[2].split(",")[0].replace('"', '')
    #reasoning = json.loads(json.loads(json.dumps(eval_resp))['content'][0]['text'])['Reasoning']
    # reasoning = json.loads(eval_resp['content'][0]['text'])['Reasoning']
    print(eval_resp['content'][0]['text'])

   
    return answer 


In [76]:
def append_llm_responses(df, is_gate_keeper = True):
    llm_scores = []
    llm_score_reasoning = []
    gate_keeper = []
    gate_keeper_reasoning = []
    for index, row in df.iterrows():

        # TODO : Change Test Query name and Test Query Response name
        query_question = row['Query_Question']
        query_response = row['Query_Response']
        if is_gate_keeper:
            gate_keeper_response = llm_gatekeeper(query_question, query_response)
            if gate_keeper_response == "Yes":
                gate_keeper_response = 1
            else:
                gate_keeper_response = 0
            gate_keeper.append(gate_keeper_response)
            # gate_keeper_reasoning.append(gate_keeper_response_reasoning)

        else:

            score= llm_judge_eval(query_question, query_response)
            llm_scores.append(score)
            # llm_score_reasoning.append(score_reasoning)
        

    # Append the LLM responses to the DataFrame
    if is_gate_keeper:
        df['LLM GateKeeper Judge'] = gate_keeper
        # df['LLM GateKeeper Judge Reasoning'] = gate_keeper_reasoning
    else:
        df['LLM Score'] = llm_scores
        # df['LLM Score Reasoning'] = llm_score_reasoning
    return df




In [77]:
# Call the function and get the updated DataFrame
gate_keeper_response = append_llm_responses(gate_keeper_receipe_df, is_gate_keeper=True)

{'id': 'msg_bdrk_01EnChrsMy7kevsrMc6crgfT', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Answer": "Yes",\n    "Reasoning": "The recipe provided is nut-free and suitable for someone with a peanut allergy, as requested in the user query."\n}'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 686, 'output_tokens': 48}}
{
    "Answer": "Yes",
    "Reasoning": "The recipe provided is nut-free and suitable for someone with a peanut allergy, as requested in the user query."
}
{'id': 'msg_bdrk_01YH15Si2UQjpRrLzGwrhXL8', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Answer": "Yes",\n    "Reasoning": "The recipe provided is a vegetarian dinner option that does not contain any shellfish, which meets the user\'s requirement of a shellfish-free dinner recipe."\n}'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'us

In [78]:
normal_recipe_response = append_llm_responses(normal_recipe_df, is_gate_keeper=False)

{'id': 'msg_bdrk_01W9VBeFGwjTx8LsJoW3pcDY', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Score": 4,\n    "Reasoning": [\n        {\n            "Accuracy": 4,\n            "Clarity": 4,\n            "Creativity": 4,\n            "Completeness": 4,\n            "Healthiness": 4,\n            "User Feedback": 4\n        }\n    ]\n}\n\nExplanation:\nThe recipe provided is a very good match for the user\'s query. It includes all the key ingredients and instructions for a vegetarian Asian fusion dish, as requested. The instructions are clear and easy to follow, with precise cooking times and temperatures. The recipe demonstrates some creativity by combining various Asian flavors and vegetables in an interesting way. It also appears to be a relatively healthy and balanced dish. Overall, the recipe is likely to receive good ratings from users for both taste and ease of preparation.'}], 'stop_reason': 'end_turn', 'sto

In [79]:
gate_keeper_response_extra = gate_keeper_response
normal_recipe_response_extra = normal_recipe_response

In [95]:
gate_keeper_response_llm_score_original = pd.read_csv("../data/gatekeeper_response_llm_score.csv")
normal_recipe_response_llm_score_original = pd.read_csv("../data/normal_recipe_response_llm_score.csv")

In [96]:
gate_keeper_response_extra

Unnamed: 0,Query_Question_No,Query_Question,top_p,top_k,temperature,max_tokens,system_prompt,retriever,coarse_search_type,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,self_query_api,self_query_model,Query_Response,LLM GateKeeper Judge
0,1,I am allergic to peanuts. Can you suggest a de...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe search for nu...,1
1,3,I have a shellfish allergy. Please provide a d...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe second set of...,1
2,4,I follow a vegan diet. Can you give me a break...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,The Vegan Mediterranean Scramble looks like a ...,1
3,5,I need a gluten-free lunch recipe. What can yo...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe latest query ...,1
4,6,I don't eat dairy products. Can you provide a ...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,Based on the dairy-free pasta recipes retrieve...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,21,"I am looking for a recipe for a cake, but it s...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,1
92,22,I am allergic to peanuts. Can you suggest a pe...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe supplemental ...,1
93,23,I follow a vegan diet but also want a recipe w...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,0
94,24,"I need a gluten-free pasta recipe, but please ...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,Based on the gluten-free pasta recipes retriev...,0


In [133]:
gate_keeper_response_extra['temperature'].mean()

0.5

In [97]:
gate_keeper_response_llm_score_original.drop('Unnamed: 0', axis = 1, inplace = True)
normal_recipe_response_llm_score_original.drop('Unnamed: 0', axis = 1, inplace = True)

In [98]:
final_gate_keeper_response_llm_score = pd.concat([gate_keeper_response_llm_score_original,gate_keeper_response_extra])
final_normal_recipe_response_llm_score = pd.concat([normal_recipe_response_llm_score_original,normal_recipe_response_extra])

In [100]:
final_normal_recipe_response_llm_score

Unnamed: 0,Query_Question_No,Query_Question,top_p,top_k,temperature,max_tokens,system_prompt,retriever,coarse_search_type,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,self_query_api,self_query_model,Query_Response,LLM Score
0,1,I enjoy asian fusion food and I am a vegetaria...,0.1,10,0.1,3000,\nYou are a helpful assistant and expert in co...,coarse,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,5
1,3,I have a peanut allergy but I like thai food. ...,0.1,10,0.1,3000,\nYou are a helpful assistant and expert in co...,coarse,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe web search re...,5
2,4,Suggest a low-carb breakfast recipe that inclu...,0.1,10,0.1,3000,\nYou are a helpful assistant and expert in co...,coarse,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional Go...,5
3,5,Suggest a healthy dinner recipe for two people...,0.1,10,0.1,3000,\nYou are a helpful assistant and expert in co...,coarse,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe web search re...,5
4,6,I am on a ketogenic diet and need a dinner rec...,0.1,10,0.1,3000,\nYou are a helpful assistant and expert in co...,coarse,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional Go...,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,12,I am on a heart-healthy diet and need a lunch ...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,Here is a great heart-healthy lunch recipe tha...,5
56,13,I am allergic to shellfish and soy. I need a v...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,This Japanese sweet potato and rice dish looks...,5
57,14,I am lactose intolerant and need a dessert rec...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,5
58,15,"Suggest a high-fiber, low-calorie lunch recipe...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,mmr,20,0.5,1,100,OpenAI,gpt-4o-mini,"Based on the recipe search results, here is a ...",4


In [101]:
final_gate_keeper_response_llm_score

Unnamed: 0,Query_Question_No,Query_Question,top_p,top_k,temperature,max_tokens,system_prompt,retriever,coarse_search_type,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,self_query_api,self_query_model,Query_Response,LLM GateKeeper Judge
0,1,I am allergic to peanuts. Can you suggest a de...,0.9,10,1.0,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,Here is a delicious nut-free dessert recipe fo...,1
1,3,I have a shellfish allergy. Please provide a d...,0.9,10,1.0,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,"The query retrieved a delicious ""No Shells Cio...",1
2,4,I follow a vegan diet. Can you give me a break...,0.9,10,1.0,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,Here is a delicious vegan breakfast recipe for...,1
3,5,I need a gluten-free lunch recipe. What can yo...,0.9,10,1.0,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,Here is a delicious gluten-free lunch recipe f...,1
4,6,I don't eat dairy products. Can you provide a ...,0.9,10,1.0,3000,\nYou are a helpful assistant and expert in co...,self_query_chain,similarity,5,0.5,1,100,OpenAI,gpt-4o-mini,Based on the recipe results from the vector da...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,21,"I am looking for a recipe for a cake, but it s...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,1
92,22,I am allergic to peanuts. Can you suggest a pe...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe supplemental ...,1
93,23,I follow a vegan diet but also want a recipe w...,0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,<search_quality_reflection>\nThe additional se...,0
94,24,"I need a gluten-free pasta recipe, but please ...",0.9,2,0.5,3000,\nYou are a helpful assistant and expert in co...,reranker,mmr,20,0.5,5,100,OpenAI,gpt-4o-mini,Based on the gluten-free pasta recipes retriev...,0


In [50]:
# Call the function and get the updated DataFrame
baseline_gate_keeper_eval_df = append_llm_responses(baseline_gate_keeper_query_df, is_gate_keeper=True)

{'id': 'msg_bdrk_014fAm5KeZEJ3rRmq6gxeUqe', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Answer": "Yes",\n    "Reasoning": "The recipe generated by the other LLM correctly follows the user\'s request for a peanut-free dessert recipe and provides several options that are safe for those with peanut allergies."\n}'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 550, 'output_tokens': 62}}
{
    "Answer": "Yes",
    "Reasoning": "The recipe generated by the other LLM correctly follows the user's request for a peanut-free dessert recipe and provides several options that are safe for those with peanut allergies."
}
{'id': 'msg_bdrk_01LWL2CtPKT7QbckJmho2uA3', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Answer": "Yes",\n    "Reasoning": "The recipe provided is shellfish-free and follows the user\'s request for 

In [51]:
# Call the function and get the updated DataFrame
baseline_test_query_eval_df = append_llm_responses(baseline_test_query_df, is_gate_keeper=False)

{'id': 'msg_bdrk_01PJRTd3huD2nroEYeruwt3z', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Score": 4,\n    "Reasoning": [\n        {\n            "Accuracy": 4,\n            "Clarity": 4,\n            "Creativity": 4,\n            "Completeness": 4,\n            "Healthiness": 4,\n            "User Feedback": 4\n        }\n    ]\n}\n\nExplanation:\n\nThe recipe generated by the other LLM is a very good match for the user\'s query. It accurately includes all the key elements requested, such as being an Asian fusion dish that is vegetarian-friendly. The instructions are clear and easy to follow, with detailed steps and measurements provided.\n\nWhile the recipe doesn\'t necessarily demonstrate exceptional creativity, it does offer a unique twist on a classic Thai dish by incorporating a variety of fresh vegetables. The recipe also appears to be reasonably healthy, providing a balanced nutritional profile.\n\nOvera

In [52]:
#Evaluating Baseline recipe response by just the Claude Sonnet 3.5 LLM
baseline_gate_keeper_eval_df.mean()

  baseline_gate_keeper_eval_df.mean()


LLM GateKeeper Judge    0.916667
dtype: float64

In [53]:
baseline_test_query_eval_df.mean()

  baseline_test_query_eval_df.mean()


LLM Score    4.6
dtype: float64

In [54]:
baseline_gate_keeper_eval_df.to_csv("Baseline_Gate_Keeper_Query_Eval.csv")
baseline_test_query_eval_df.to_csv("Baseline_Test_Query_Eval.csv")

In [363]:
# dict_1 = {'id': 'msg_bdrk_01URZyeD5crwuzh5jcpEj12H', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Answer": "No",\n    "Reasoning": "The recipe provided is for a vegan seitan-based "chicken" substitute, which does not meet the user\'s request for a recipe with actual chicken."\n}'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 780, 'output_tokens': 53}}


In [385]:
# dict_2 = {'id': 'msg_bdrk_01SPzvNNosmfdCD12DyW8MfC', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Score": 4,\n    "Reasoning": [\n        {\n            "Accuracy": 4,\n            "Clarity": 4,\n            "Creativity": 4,\n            "Completeness": 4,\n            "Healthiness": 4,\n            "User Feedback": 4\n        }\n    ]\n}\n\nExplanation:\nThe recipe generated by the other LLM is a very good match for the user\'s query. It includes all the key ingredients and follows the user\'s preference for a vegetarian Asian fusion dish. The instructions are clear and logically sequenced, making it easy to follow. While the recipe doesn\'t demonstrate exceptional creativity, it does offer an appealing and unique combination of flavors. The recipe includes all the necessary details, and the nutritional profile seems generally healthy, though it could be further optimized. Overall, this recipe is likely to receive good ratings from users for both taste and ease of preparation.'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 1498, 'output_tokens': 214}}




In [560]:
dict_2 = {'id': 'msg_bdrk_01SPzvNNosmfdCD12DyW8MfC', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-haiku-20240307', 'content': [{'type': 'text', 'text': '{\n    "Score": 4,\n    "Reasoning": [\n        {\n            "Accuracy": 4,\n            "Clarity": 4,\n            "Creativity": 4,\n            "Completeness": 4,\n            "Healthiness": 4,\n            "User Feedback": 4\n        }\n    ]\n}\n\nExplanation:\nThe recipe generated by the other LLM is a very good match for the user\'s query. It includes all the key ingredients and follows the user\'s preference for a vegetarian Asian fusion dish. The instructions are clear and logically sequenced, making it easy to follow. While the recipe doesn\'t demonstrate exceptional creativity, it does offer an appealing and unique combination of flavors. The recipe includes all the necessary details, and the nutritional profile seems generally healthy, though it could be further optimized. Overall, this recipe is likely to receive good ratings from users for both taste and ease of preparation.'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 1498, 'output_tokens': 214}}




In [102]:
min(final_normal_recipe_response_llm_score['LLM Score'])

4

In [103]:
final_normal_recipe_response_llm_score.groupby(["retriever"]).mean()

  final_normal_recipe_response_llm_score.groupby(["retriever"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,top_k,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
retriever,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,8.933333,0.5,6.0,0.533333,3000.0,5.0,0.5,1.0,100.0,4.866667
reranker,8.933333,0.557143,5.428571,0.528571,3000.0,7.142857,0.5,1.285714,100.0,4.709524
self_query_chain,8.933333,0.557143,5.428571,0.528571,3000.0,7.142857,0.5,1.285714,100.0,4.67619


In [104]:
final_normal_recipe_response_llm_score.groupby(["temperature"]).mean()

  final_normal_recipe_response_llm_score.groupby(["temperature"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.1,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.783333
0.5,8.933333,0.6,5.0,3000.0,8.75,0.5,1.5,100.0,4.745833
1.0,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.705556


In [105]:
final_normal_recipe_response_llm_score.groupby(["top_p"]).mean()

  final_normal_recipe_response_llm_score.groupby(["top_p"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_k,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
top_p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.1,8.933333,6.0,0.533333,3000.0,5.0,0.5,1.0,100.0,4.762963
0.9,8.933333,5.272727,0.527273,3000.0,7.727273,0.5,1.363636,100.0,4.730303


In [106]:
final_normal_recipe_response_llm_score.groupby(["top_k"]).mean()

  final_normal_recipe_response_llm_score.groupby(["top_k"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
top_k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,8.933333,0.572727,0.527273,3000.0,7.727273,0.5,1.363636,100.0,4.745455
10,8.933333,0.5,0.533333,3000.0,5.0,0.5,1.0,100.0,4.744444


In [107]:
final_normal_recipe_response_llm_score.groupby(["retriever", "temperature"]).mean()

  final_normal_recipe_response_llm_score.groupby(["retriever", "temperature"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Query_Question_No,top_p,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
retriever,temperature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.866667
coarse,0.5,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.9
coarse,1.0,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.833333
reranker,0.1,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.733333
reranker,0.5,8.933333,0.633333,4.666667,3000.0,10.0,0.5,1.666667,100.0,4.711111
reranker,1.0,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.683333
self_query_chain,0.1,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.75
self_query_chain,0.5,8.933333,0.633333,4.666667,3000.0,10.0,0.5,1.666667,100.0,4.677778
self_query_chain,1.0,8.933333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,4.6


In [108]:
final_normal_recipe_response_llm_score.groupby(["retriever", "temperature","top_p"]).mean()

  final_normal_recipe_response_llm_score.groupby(["retriever", "temperature","top_p"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Query_Question_No,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
retriever,temperature,top_p,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,0.1,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.866667
coarse,0.1,0.9,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.866667
coarse,0.5,0.1,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.9
coarse,0.5,0.9,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.9
coarse,1.0,0.1,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.9
coarse,1.0,0.9,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.766667
reranker,0.1,0.1,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.733333
reranker,0.1,0.9,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.733333
reranker,0.5,0.1,8.933333,6.0,3000.0,5.0,0.5,1.0,100.0,4.733333
reranker,0.5,0.9,8.933333,4.0,3000.0,12.5,0.5,2.0,100.0,4.7


In [563]:
normal_recipe_response.groupby(["retriever", "temperature","top_p", "top_k"]).mean()

  normal_recipe_response.groupby(["retriever", "temperature","top_p", "top_k"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Query_Question_No,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM Score
retriever,temperature,top_p,top_k,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,0.1,2,8.933333,3000.0,5.0,0.5,1.0,100.0,4.8
coarse,0.1,0.1,10,8.933333,3000.0,5.0,0.5,1.0,100.0,4.933333
coarse,0.1,0.9,2,8.933333,3000.0,5.0,0.5,1.0,100.0,5.0
coarse,0.1,0.9,10,8.933333,3000.0,5.0,0.5,1.0,100.0,4.733333
coarse,0.5,0.1,2,8.933333,3000.0,5.0,0.5,1.0,100.0,4.866667
coarse,0.5,0.1,10,8.933333,3000.0,5.0,0.5,1.0,100.0,4.933333
coarse,0.5,0.9,2,8.933333,3000.0,5.0,0.5,1.0,100.0,4.933333
coarse,0.5,0.9,10,8.933333,3000.0,5.0,0.5,1.0,100.0,4.866667
coarse,1.0,0.1,2,8.933333,3000.0,5.0,0.5,1.0,100.0,4.933333
coarse,1.0,0.1,10,8.933333,3000.0,5.0,0.5,1.0,100.0,4.866667


In [109]:
final_gate_keeper_response_llm_score.groupby(["retriever"]).mean()

  final_gate_keeper_response_llm_score.groupby(["retriever"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,top_k,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
retriever,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,13.458333,0.5,6.0,0.533333,3000.0,5.0,0.5,1.0,100.0,0.9375
reranker,13.458333,0.557143,5.428571,0.528571,3000.0,7.142857,0.5,1.285714,100.0,0.89881
self_query_chain,13.458333,0.557143,5.428571,0.528571,3000.0,7.142857,0.5,1.285714,100.0,0.91369


In [110]:
final_gate_keeper_response_llm_score.groupby(["temperature"]).mean()

  final_gate_keeper_response_llm_score.groupby(["temperature"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.1,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.902778
0.5,13.458333,0.6,5.0,3000.0,8.75,0.5,1.5,100.0,0.919271
1.0,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.923611


In [111]:
final_gate_keeper_response_llm_score.groupby(["top_p"]).mean()

  final_gate_keeper_response_llm_score.groupby(["top_p"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_k,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
top_p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.1,13.458333,6.0,0.533333,3000.0,5.0,0.5,1.0,100.0,0.907407
0.9,13.458333,5.272727,0.527273,3000.0,7.727273,0.5,1.363636,100.0,0.922348


In [112]:
final_gate_keeper_response_llm_score.groupby(["top_k"]).mean()

  final_gate_keeper_response_llm_score.groupby(["top_k"]).mean()


Unnamed: 0_level_0,Query_Question_No,top_p,temperature,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
top_k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,13.458333,0.572727,0.527273,3000.0,7.727273,0.5,1.363636,100.0,0.909091
10,13.458333,0.5,0.533333,3000.0,5.0,0.5,1.0,100.0,0.923611


In [113]:
final_gate_keeper_response_llm_score.groupby(["retriever", "temperature"]).mean()

  final_gate_keeper_response_llm_score.groupby(["retriever", "temperature"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Query_Question_No,top_p,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
retriever,temperature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,0.5,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,1.0,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
reranker,0.1,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.854167
reranker,0.5,13.458333,0.633333,4.666667,3000.0,10.0,0.5,1.666667,100.0,0.916667
reranker,1.0,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.916667
self_query_chain,0.1,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.916667
self_query_chain,0.5,13.458333,0.633333,4.666667,3000.0,10.0,0.5,1.666667,100.0,0.909722
self_query_chain,1.0,13.458333,0.5,6.0,3000.0,5.0,0.5,1.0,100.0,0.916667


In [114]:
final_gate_keeper_response_llm_score.groupby(["retriever", "temperature", "top_p"]).mean()

  final_gate_keeper_response_llm_score.groupby(["retriever", "temperature", "top_p"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Query_Question_No,top_k,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
retriever,temperature,top_p,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,0.1,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,0.1,0.9,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,0.5,0.1,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,0.5,0.9,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.9375
coarse,1.0,0.1,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,1.0,0.9,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.958333
reranker,0.1,0.1,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.833333
reranker,0.1,0.9,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.875
reranker,0.5,0.1,13.458333,6.0,3000.0,5.0,0.5,1.0,100.0,0.916667
reranker,0.5,0.9,13.458333,4.0,3000.0,12.5,0.5,2.0,100.0,0.916667


In [564]:
gate_keeper_response.groupby(["retriever", "temperature","top_p", "top_k"]).mean()

  gate_keeper_response.groupby(["retriever", "temperature","top_p", "top_k"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Query_Question_No,max_tokens,coarse_top_k,coarse_lambda,reranker_top_n,coarse_fetch_k,LLM GateKeeper Judge
retriever,temperature,top_p,top_k,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
coarse,0.1,0.1,2,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,0.1,0.1,10,13.458333,3000.0,5.0,0.5,1.0,100.0,0.958333
coarse,0.1,0.9,2,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,0.1,0.9,10,13.458333,3000.0,5.0,0.5,1.0,100.0,0.958333
coarse,0.5,0.1,2,13.458333,3000.0,5.0,0.5,1.0,100.0,0.958333
coarse,0.5,0.1,10,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,0.5,0.9,2,13.458333,3000.0,5.0,0.5,1.0,100.0,0.958333
coarse,0.5,0.9,10,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,1.0,0.1,2,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667
coarse,1.0,0.1,10,13.458333,3000.0,5.0,0.5,1.0,100.0,0.916667


In [123]:
gate_keeper_score = final_gate_keeper_response_llm_score.groupby(["retriever", "temperature","top_p", "top_k"]).mean().reset_index()

  gate_keeper_score = final_gate_keeper_response_llm_score.groupby(["retriever", "temperature","top_p", "top_k"]).mean().reset_index()


In [124]:
normal_recipe_score = final_normal_recipe_response_llm_score.groupby(["retriever", "temperature","top_p", "top_k"]).mean().reset_index()

  normal_recipe_score = final_normal_recipe_response_llm_score.groupby(["retriever", "temperature","top_p", "top_k"]).mean().reset_index()


In [125]:
gate_keeper_score.drop(["Query_Question_No", "max_tokens", "coarse_lambda", "coarse_fetch_k"], axis=1, inplace=True)

In [126]:
normal_recipe_score.drop(["Query_Question_No", "max_tokens", "coarse_lambda", "coarse_fetch_k"], axis=1, inplace=True)

In [127]:
final_df = gate_keeper_score.merge(normal_recipe_score,  how='inner', on=['temperature', "top_p", "top_k", "retriever"])

In [128]:
final_df['Total_Score'] = (final_df['LLM GateKeeper Judge'] + final_df['LLM Score'])/2

In [129]:
final_df.sort_values(by=['Total_Score'], ascending=False).head(10)

Unnamed: 0,retriever,temperature,top_p,top_k,coarse_top_k_x,reranker_top_n_x,LLM GateKeeper Judge,coarse_top_k_y,reranker_top_n_y,LLM Score,Total_Score
2,coarse,0.1,0.9,2,5.0,1.0,0.916667,5.0,1.0,5.0,2.958333
1,coarse,0.1,0.1,10,5.0,1.0,0.958333,5.0,1.0,4.933333,2.945833
6,coarse,0.5,0.9,2,5.0,1.0,0.958333,5.0,1.0,4.933333,2.945833
5,coarse,0.5,0.1,10,5.0,1.0,0.916667,5.0,1.0,4.933333,2.925
8,coarse,1.0,0.1,2,5.0,1.0,0.916667,5.0,1.0,4.933333,2.925
4,coarse,0.5,0.1,2,5.0,1.0,0.958333,5.0,1.0,4.866667,2.9125
25,self_query_chain,0.1,0.1,10,5.0,1.0,0.958333,5.0,1.0,4.866667,2.9125
27,self_query_chain,0.1,0.9,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667
7,coarse,0.5,0.9,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667
9,coarse,1.0,0.1,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667


In [136]:
final_df.sort_values(by=['Total_Score'], ascending=False).head(100)

Unnamed: 0,retriever,temperature,top_p,top_k,coarse_top_k_x,reranker_top_n_x,LLM GateKeeper Judge,coarse_top_k_y,reranker_top_n_y,LLM Score,Total_Score
2,coarse,0.1,0.9,2,5.0,1.0,0.916667,5.0,1.0,5.0,2.958333
1,coarse,0.1,0.1,10,5.0,1.0,0.958333,5.0,1.0,4.933333,2.945833
6,coarse,0.5,0.9,2,5.0,1.0,0.958333,5.0,1.0,4.933333,2.945833
5,coarse,0.5,0.1,10,5.0,1.0,0.916667,5.0,1.0,4.933333,2.925
8,coarse,1.0,0.1,2,5.0,1.0,0.916667,5.0,1.0,4.933333,2.925
4,coarse,0.5,0.1,2,5.0,1.0,0.958333,5.0,1.0,4.866667,2.9125
25,self_query_chain,0.1,0.1,10,5.0,1.0,0.958333,5.0,1.0,4.866667,2.9125
27,self_query_chain,0.1,0.9,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667
7,coarse,0.5,0.9,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667
9,coarse,1.0,0.1,10,5.0,1.0,0.916667,5.0,1.0,4.866667,2.891667


In [137]:
final_normal_recipe_response_llm_score.to_csv("normal_recipe_response_llm_score_complete.csv")

In [138]:
final_gate_keeper_response_llm_score.to_csv("gatekeeper_response_llm_score_complete.csv")

### Use the result below for the latest run -> 40 configurations 

In [7]:
llm_quality_score_df = pd.read_csv("../data/normal_recipe_response_llm_score_complete.csv")
llm_gate_keeper_score_df = pd.read_csv("../data/gatekeeper_response_llm_score_complete.csv")

In [8]:
llm_gate_keeper_score_df.columns

Index(['Unnamed: 0', 'Query_Question_No', 'Query_Question', 'top_p', 'top_k',
       'temperature', 'max_tokens', 'system_prompt', 'retriever',
       'coarse_search_type', 'coarse_top_k', 'coarse_lambda', 'reranker_top_n',
       'coarse_fetch_k', 'self_query_api', 'self_query_model',
       'Query_Response', 'LLM GateKeeper Judge'],
      dtype='object')

In [9]:
llm_gate_keeper_score = llm_gate_keeper_score_df.groupby(["retriever","coarse_search_type","coarse_top_k","reranker_top_n","temperature", "top_p", "top_k"]).mean().reset_index()

  llm_gate_keeper_score = llm_gate_keeper_score_df.groupby(["retriever","coarse_search_type","coarse_top_k","reranker_top_n","temperature", "top_p", "top_k"]).mean().reset_index()


In [10]:
llm_quality_score = llm_quality_score_df.groupby(["retriever","coarse_search_type","coarse_top_k","reranker_top_n","temperature", "top_p", "top_k"]).mean().reset_index()

  llm_quality_score = llm_quality_score_df.groupby(["retriever","coarse_search_type","coarse_top_k","reranker_top_n","temperature", "top_p", "top_k"]).mean().reset_index()


In [11]:
llm_gate_keeper_score.drop(["Query_Question_No", "max_tokens", "coarse_lambda", "Unnamed: 0"], axis=1, inplace=True)
llm_quality_score.drop(["Query_Question_No", "max_tokens", "coarse_lambda", "Unnamed: 0"], axis=1, inplace=True)


In [12]:
llm_gate_keeper_score

Unnamed: 0,retriever,coarse_search_type,coarse_top_k,reranker_top_n,temperature,top_p,top_k,coarse_fetch_k,LLM GateKeeper Judge
0,coarse,similarity,5,1,0.1,0.1,2,100.0,0.916667
1,coarse,similarity,5,1,0.1,0.1,10,100.0,0.958333
2,coarse,similarity,5,1,0.1,0.9,2,100.0,0.916667
3,coarse,similarity,5,1,0.1,0.9,10,100.0,0.958333
4,coarse,similarity,5,1,0.5,0.1,2,100.0,0.958333
5,coarse,similarity,5,1,0.5,0.1,10,100.0,0.916667
6,coarse,similarity,5,1,0.5,0.9,2,100.0,0.958333
7,coarse,similarity,5,1,0.5,0.9,10,100.0,0.916667
8,coarse,similarity,5,1,1.0,0.1,2,100.0,0.916667
9,coarse,similarity,5,1,1.0,0.1,10,100.0,0.916667


In [13]:
llm_quality_score

Unnamed: 0,retriever,coarse_search_type,coarse_top_k,reranker_top_n,temperature,top_p,top_k,coarse_fetch_k,LLM Score
0,coarse,similarity,5,1,0.1,0.1,2,100.0,4.8
1,coarse,similarity,5,1,0.1,0.1,10,100.0,4.933333
2,coarse,similarity,5,1,0.1,0.9,2,100.0,5.0
3,coarse,similarity,5,1,0.1,0.9,10,100.0,4.733333
4,coarse,similarity,5,1,0.5,0.1,2,100.0,4.866667
5,coarse,similarity,5,1,0.5,0.1,10,100.0,4.933333
6,coarse,similarity,5,1,0.5,0.9,2,100.0,4.933333
7,coarse,similarity,5,1,0.5,0.9,10,100.0,4.866667
8,coarse,similarity,5,1,1.0,0.1,2,100.0,4.933333
9,coarse,similarity,5,1,1.0,0.1,10,100.0,4.866667


In [14]:
final_score_df = llm_gate_keeper_score.merge(llm_quality_score,  how='inner', on=["retriever","coarse_top_k","reranker_top_n","temperature", "top_p", "top_k"])

In [15]:
final_score_df['Total_Score'] = (final_score_df['LLM GateKeeper Judge'] + final_score_df['LLM Score'])/2

In [16]:
final_score_df

Unnamed: 0,retriever,coarse_search_type_x,coarse_top_k,reranker_top_n,temperature,top_p,top_k,coarse_fetch_k_x,LLM GateKeeper Judge,coarse_search_type_y,coarse_fetch_k_y,LLM Score,Total_Score
0,coarse,similarity,5,1,0.1,0.1,2,100.0,0.916667,similarity,100.0,4.8,2.858333
1,coarse,similarity,5,1,0.1,0.1,10,100.0,0.958333,similarity,100.0,4.933333,2.945833
2,coarse,similarity,5,1,0.1,0.9,2,100.0,0.916667,similarity,100.0,5.0,2.958333
3,coarse,similarity,5,1,0.1,0.9,10,100.0,0.958333,similarity,100.0,4.733333,2.845833
4,coarse,similarity,5,1,0.5,0.1,2,100.0,0.958333,similarity,100.0,4.866667,2.9125
5,coarse,similarity,5,1,0.5,0.1,10,100.0,0.916667,similarity,100.0,4.933333,2.925
6,coarse,similarity,5,1,0.5,0.9,2,100.0,0.958333,similarity,100.0,4.933333,2.945833
7,coarse,similarity,5,1,0.5,0.9,10,100.0,0.916667,similarity,100.0,4.866667,2.891667
8,coarse,similarity,5,1,1.0,0.1,2,100.0,0.916667,similarity,100.0,4.933333,2.925
9,coarse,similarity,5,1,1.0,0.1,10,100.0,0.916667,similarity,100.0,4.866667,2.891667


In [18]:
final_score_df.sort_values(by=['Total_Score'], ascending=False).head(40)

Unnamed: 0,retriever,coarse_search_type_x,coarse_top_k,reranker_top_n,temperature,top_p,top_k,coarse_fetch_k_x,LLM GateKeeper Judge,coarse_search_type_y,coarse_fetch_k_y,LLM Score,Total_Score
2,coarse,similarity,5,1,0.1,0.9,2,100.0,0.916667,similarity,100.0,5.0,2.958333
1,coarse,similarity,5,1,0.1,0.1,10,100.0,0.958333,similarity,100.0,4.933333,2.945833
6,coarse,similarity,5,1,0.5,0.9,2,100.0,0.958333,similarity,100.0,4.933333,2.945833
5,coarse,similarity,5,1,0.5,0.1,10,100.0,0.916667,similarity,100.0,4.933333,2.925
8,coarse,similarity,5,1,1.0,0.1,2,100.0,0.916667,similarity,100.0,4.933333,2.925
4,coarse,similarity,5,1,0.5,0.1,2,100.0,0.958333,similarity,100.0,4.866667,2.9125
29,self_query_chain,similarity,5,1,0.1,0.1,10,100.0,0.958333,similarity,100.0,4.866667,2.9125
7,coarse,similarity,5,1,0.5,0.9,10,100.0,0.916667,similarity,100.0,4.866667,2.891667
9,coarse,similarity,5,1,1.0,0.1,10,100.0,0.916667,similarity,100.0,4.866667,2.891667
31,self_query_chain,similarity,5,1,0.1,0.9,10,100.0,0.916667,similarity,100.0,4.866667,2.891667
