In [1]:
# !pip install boto3 sagemaker langchain langchain-community langchain-core faiss-cpu requests opensearch-py sentence-transformers langchain-text-splitters requests-aws4auth qdrant-client -U

In [84]:
# Import  modules and libraries 
import json
import boto3
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableMap

In [85]:
# Constants
# Bedrock model ID of the LLM that acts as the evaluator; query_eval_llm passes it as `modelId`.
MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"

In [86]:
# Bedrock runtime client (pinned to us-east-1) shared by every evaluation call in this notebook.
bedrock_client = boto3.client('bedrock-runtime', region_name="us-east-1")

In [None]:
# TODO: Include the test API call in this notebook.

In [87]:
# Template for the user turn sent to the eval LLM. process_prompt fills in the
# literal "{query}" placeholder with str.replace (not str.format), so braces
# inside the inserted text are left untouched.
baseline_user_prompt = """
### Here is a user prompt:
{query}
"""

In [88]:
def process_prompt(query_args):
    """Wrap a query in the baseline prompt and return it as chat messages.

    Args:
        query_args: Mapping with a ``'query'`` key whose string value is
            substituted for the ``{query}`` placeholder of
            ``baseline_user_prompt``.

    Returns:
        A one-element list holding a single ``user`` message whose content
        is one ``text`` part containing the rendered prompt.
    """
    rendered_prompt = baseline_user_prompt.replace("{query}", query_args['query'])

    text_part = {"type": "text", "text": rendered_prompt}
    user_turn = {"role": "user", "content": [text_part]}

    # A list of turns is more structure than a single prompt needs today; it
    # is kept so chat history can later be persisted for continuous dialogue.
    return [user_turn]

In [89]:
# Eval LLM
def query_eval_llm(messages):
    """Send chat ``messages`` to the evaluation model and return its reply.

    Args:
        messages: Chat-style message list (as produced by ``process_prompt``).

    Returns:
        The JSON-decoded response body returned by ``invoke_model``.
    """
    request_payload = {
        # Required in order to use the chat-style messages object.
        'anthropic_version': 'bedrock-2023-05-31',
        'messages': messages,
        'max_tokens': 3000,
        'temperature': 0.1,
        'top_p': 0.9,
        'top_k': 2,
    }

    raw_response = bedrock_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(request_payload),
    )

    # The body is a stream; read it fully, then decode the JSON document.
    body_stream = raw_response.get('body')
    return json.loads(body_stream.read())

In [90]:
# Evaluation chain: wrap the raw input as {"query": <input>}, render it into
# chat messages with process_prompt, then send those messages to the eval LLM.
# Invoking the chain returns whatever query_eval_llm returns.
qdrant_rag_chain_eval = (
    RunnableMap(
         {"query": RunnablePassthrough()}
    )
    | process_prompt
    | query_eval_llm
)

In [91]:
# Load the test-API results (a JSON file stored in S3) into a DataFrame.

s3 = boto3.client('s3')
bucket_name = 'test-api-results'
file_key = 'demo_full'

# Single source of truth for the local copy, so the download target and the
# path that is read back can never drift apart when switching environments.
# NOTE(review): absolute, user-specific path - move to a configurable data dir.
# Thomas' local laptop:
local_path = f"/Users/tl164/Downloads/{file_key}"
# Sagemaker instance:
# local_path = f'../data/{file_key}'

s3.download_file(bucket_name, file_key, local_path)

# Parse the JSON document into a dictionary. The context manager closes the
# file even when parsing fails; JSON text is read as UTF-8.
with open(local_path, encoding="utf-8") as f:
    data = json.load(f)

# Flatten the nested JSON into one row with dot-separated column names.
response_df = pd.json_normalize(data)

### Still need to format the dataframe into a certain way
response_df

Unnamed: 0,test_query_1.query,test_query_1.response,test_query_2.query,test_query_2.response,test_query_3.query,test_query_3.response,test_query_4.query,test_query_4.response,test_query_5.query,test_query_5.response,...,config.COARSE_SEARCH_KWARGS.lambda_mult,config.COARSE_SEARCH_TYPE,config.DOWNLOAD_PATH,config.FILE_KEY,config.MAX_DOC_COUNT,config.MODEL_ID,config.RERANKER_TOP_N,config.RETRIEVER,config.SELF_QUERY_API,config.SELF_QUERY_MODEL
0,\nI enjoy asian fusion food and I am a vegetar...,<search_quality_reflection>\nThe new search re...,\nI have a peanut allergy but I like thai food...,<search_quality_reflection>\nThe focused query...,\nSuggest a low-carb breakfast recipe that inc...,<search_quality_reflection>\nThe third search ...,\nSuggest a healthy dinner recipe for two peop...,<search_quality_reflection>\nThe additional se...,\nI am on a ketogenic diet and need a dinner r...,Unfortunately I could not find any recipes in ...,...,0.5,mmr,data,recipes_w_cleaning_time_combined_features.parquet,150,anthropic.claude-3-sonnet-20240229-v1:0,1,self_query_chain,OpenAI,gpt-4o


In [98]:
def _parse_judge_reply(reply_text):
    """Decode the judge model's reply text into ``(score, reasoning)``.

    The reply is parsed once. If it is not pure JSON (for example the model
    added a preamble or wrapped the object in a markdown code fence), the
    first JSON value starting at the first ``{`` is decoded instead and any
    trailing text is ignored.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from the reply.
        KeyError: If the decoded object lacks ``"Score"`` or ``"Reasoning"``.
    """
    try:
        verdict = json.loads(reply_text)
    except json.JSONDecodeError:
        brace_at = reply_text.find('{')
        if brace_at == -1:
            raise
        verdict, _ = json.JSONDecoder().raw_decode(reply_text[brace_at:])
    return verdict['Score'], verdict['Reasoning']


def llm_judge_eval(query_question, query_response):
    """Ask the eval LLM to grade a generated recipe against a 1-5 rubric.

    Args:
        query_question: The user query that the recipe was generated for.
        query_response: The recipe text produced by the other LLM.

    Returns:
        A ``(score, reasoning)`` tuple taken from the ``"Score"`` and
        ``"Reasoning"`` keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply contains no decodable JSON object.
        KeyError: If the reply JSON lacks an expected key.
    """
    eval_message = f"""
    You are a helpful assistant and expert in reviewing cooking recipes.

    Please look at the given user query, recipe generated by another LLM and follow the rubric below. Provide a score between 1-5 on how good the recipe is.

    user query : "{query_question}"

    Recipe generated by another LLM: "{query_response}"

    Recipe review rubric: 

    Grading Scale (1-5)

    5 - Exceptional Recipe:

    1. Accuracy: The recipe is highly accurate and closely matches the user's query, including all specified ingredients, dietary restrictions, and desired cuisine type.
    2. Clarity: The instructions are clear, easy to follow, and logically sequenced. Cooking times and temperatures are precise.
    3. Creativity: The recipe demonstrates creativity, offering a unique or interesting twist on a classic dish or a novel combination of ingredients.
    4. Completeness: The recipe includes all necessary details, such as ingredient measurements, preparation steps, serving suggestions, and any relevant tips or variations.
    5. Healthiness: The recipe provides a balanced nutritional profile, aligning with any specified health goals or dietary considerations.
    6. User Feedback: The recipe is likely to receive high ratings from users for both taste and ease of preparation.
    4 - Very Good Recipe:

    1. Accuracy: The recipe mostly matches the user's query with minor deviations or substitutions that still align with the user's dietary restrictions and preferences.
    2. Clarity: The instructions are clear and easy to follow, with only minor areas that could benefit from additional detail.
    3. Creativity: The recipe shows some creativity and presents an appealing dish, though it may not be as unique as a 5-rated recipe.
    4. Completeness: The recipe includes most necessary details, but might miss a few minor tips or variations.
    5. Healthiness: The recipe is generally healthy, though it may not be as nutritionally balanced as a 5-rated recipe.
    6. User Feedback: The recipe is likely to receive good ratings from users, being tasty and reasonably easy to prepare.
    3 - Good Recipe:

    1. Accuracy: The recipe has a reasonable match with the user's query but may include some inaccuracies or ingredient substitutions that slightly alter the dish's nature.
    2. Clarity: The instructions are generally clear but may have a few confusing steps or lack detailed guidance in some areas.
    3. Creativity: The recipe is standard with minimal creativity or uniqueness.
    4. Completeness: The recipe includes the essential details but lacks additional helpful information or suggestions.
    5. Healthiness: The recipe is moderately healthy but may lack balance in terms of nutritional profile.
    6. User Feedback: The recipe is expected to receive average ratings, being satisfactory but not outstanding in taste or ease of preparation.
    2 - Fair Recipe:

    1. Accuracy: The recipe has noticeable discrepancies from the user's query, potentially including ingredients that were supposed to be excluded due to dietary restrictions.
    2. Clarity: The instructions are unclear or difficult to follow, with significant gaps or ambiguities.
    3. Creativity: The recipe lacks creativity and may appear bland or uninspired.
    4. Completeness: The recipe is missing several important details, such as precise measurements or key preparation steps.
    5. Healthiness: The recipe is not particularly healthy and may have an unbalanced nutritional profile.
    6. User Feedback: The recipe is likely to receive below-average ratings due to issues with taste, clarity, or preparation difficulty.
    1 - Poor Recipe:

    1. Accuracy: The recipe significantly deviates from the user's query, ignoring key dietary restrictions or preferences.
    2. Clarity: The instructions are confusing, incomplete, or incorrect, making the recipe difficult or impossible to follow.
    3. Creativity: The recipe is not creative and may seem haphazard or poorly thought out.
    4. Completeness: The recipe is missing critical details, such as major ingredients, steps, or cooking times.
    5. Healthiness: The recipe is unhealthy and lacks a balanced nutritional profile.
    6. User Feedback: The recipe is likely to receive low ratings due to poor taste, difficulty in preparation, or failure to meet user expectations.

    Return the output in JSON format with keys: "Score" (1-5), "Reasoning" (list of dicts with "Accuracy", "Clarity", "Creativity", "Completeness", "Healthiness", "User feedback").
    """

    eval_resp = qdrant_rag_chain_eval.invoke(eval_message)
    reply_text = eval_resp['content'][0]['text']
    # Show the raw reply before parsing so it is still visible when parsing fails.
    print(reply_text)

    score, reasoning = _parse_judge_reply(reply_text)
    return score, reasoning
    

In [101]:
def _parse_gatekeeper_reply(reply_text):
    """Decode the gatekeeper model's reply text into ``(answer, reasoning)``.

    The reply is parsed once. If it is not pure JSON (for example the model
    added a preamble or wrapped the object in a markdown code fence), the
    first JSON value starting at the first ``{`` is decoded instead and any
    trailing text is ignored.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from the reply.
        KeyError: If the decoded object lacks ``"Answer"`` or ``"Reasoning"``.
    """
    try:
        verdict = json.loads(reply_text)
    except json.JSONDecodeError:
        brace_at = reply_text.find('{')
        if brace_at == -1:
            raise
        verdict, _ = json.JSONDecoder().raw_decode(reply_text[brace_at:])
    return verdict['Answer'], verdict['Reasoning']


def llm_gatekeeper(query_question, query_response):
    """Ask the eval LLM for a Yes/No verdict on whether a recipe meets the request.

    Args:
        query_question: The user query that the recipe was generated for.
        query_response: The recipe text produced by the other LLM.

    Returns:
        An ``(answer, reasoning)`` tuple taken from the ``"Answer"`` and
        ``"Reasoning"`` keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply contains no decodable JSON object.
        KeyError: If the reply JSON lacks an expected key.
    """

    eval_message = f"""
    You are a helpful assistant and expert in reviewing and judging cooking recipes. 
    Please be as unbiased and impartial towards the recipe.
    Please look at the given user query, recipe generated by another LLM. 
    Return if the response correctly follows the user's request, dietary restrictions and preferences. Answer, Yes or No.
    user query : "{query_question}"
    Recipe generated by another LLM: "{query_response}"
    Return the output in JSON format with keys: "Answer" (Yes or No), "Reasoning" (Provide one sentence reasoning).
    """

    eval_resp = qdrant_rag_chain_eval.invoke(eval_message)
    reply_text = eval_resp['content'][0]['text']
    # Show the raw reply before parsing so it is still visible when parsing fails.
    print(reply_text)

    answer, reasoning = _parse_gatekeeper_reply(reply_text)
    return answer, reasoning
    


In [102]:
def append_llm_responses(df, query_col='test_query_1.query', response_col='test_query_1.response'):
    """Score every row of ``df`` with both LLM evaluators and attach the results.

    Args:
        df: DataFrame holding one query/response pair per row.
        query_col: Name of the column containing the user query.
        response_col: Name of the column containing the generated response.

    Returns:
        The same ``df`` object, modified in place, with four added columns:
        ``'LLM Score'``, ``'LLM Score Reasoning'``, ``'LLM GateKeeper Judge'``
        and ``'LLM GateKeeper Judge Reasoning'``.

    Raises:
        KeyError: If ``query_col`` or ``response_col`` is not a column of ``df``.
    """
    llm_scores = []
    llm_score_reasoning = []
    gate_keeper = []
    gate_keeper_reasoning = []

    # Walk the frame that was passed in. Reading a module-level frame here
    # would score the wrong rows (or mismatch lengths) for any other input.
    for query_question, query_response in zip(df[query_col], df[response_col]):
        score, score_reasoning = llm_judge_eval(query_question, query_response)
        gate_keeper_response, gate_keeper_response_reasoning = llm_gatekeeper(query_question, query_response)

        llm_scores.append(score)
        llm_score_reasoning.append(score_reasoning)
        gate_keeper.append(gate_keeper_response)
        gate_keeper_reasoning.append(gate_keeper_response_reasoning)

    # Append the LLM responses to the DataFrame
    df['LLM Score'] = llm_scores
    df['LLM Score Reasoning'] = llm_score_reasoning
    df['LLM GateKeeper Judge'] = gate_keeper
    df['LLM GateKeeper Judge Reasoning'] = gate_keeper_reasoning
    return df

# Run both LLM evaluators over response_df. The function adds its result
# columns to the frame it is given and returns that same frame, so
# updated_df and response_df refer to the same object.
updated_df = append_llm_responses(response_df)

{
    "Score": 5,
    "Reasoning": [
        {
            "Accuracy": "The recipe is highly accurate and closely matches the user's query, including all specified ingredients, dietary restrictions, and desired cuisine type.",
            "Clarity": "The instructions are clear, easy to follow, and logically sequenced. Cooking times and temperatures are precise.",
            "Creativity": "The recipe demonstrates creativity, offering a unique or interesting twist on a classic dish with the combination of coconut-ginger curry sauce and udon noodles.",
            "Completeness": "The recipe includes all necessary details, such as ingredient measurements, preparation steps, serving suggestions, and relevant tips.",
            "Healthiness": "The recipe provides a balanced nutritional profile, aligning with the user's vegetarian dietary considerations.",
            "User Feedback": "The recipe is likely to receive high ratings from users for both taste and ease of preparation."
        

In [None]:
# Persist the scored results as CSV, using a path relative to the current working directory.
updated_df.to_csv("response_with_score.csv")