### Install dependencies

In [1]:
!pip install boto3 sagemaker langchain langchain-community langchain-core faiss-cpu requests opensearch-py sentence-transformers langchain-text-splitters requests-aws4auth qdrant-client -U



### Load recipe data (Parquet) from S3

In [2]:
!pwd

/home/ec2-user/SageMaker/Enterprise-RAG/notebooks


In [3]:
from pathlib import Path

import boto3
import pandas as pd

s3 = boto3.client('s3')
bucket_name = 'recipes-rag'

In [4]:
file_key = 'recipes_w_cleaning_time_combined_features.parquet'

# Keep a local copy so re-running this cell does not download from S3 again;
# delete the local file to force a fresh download.
local_path = Path('../data') / file_key
# download_file cannot write into a directory that does not exist yet.
local_path.parent.mkdir(parents=True, exist_ok=True)
if not local_path.exists():
    s3.download_file(bucket_name, file_key, str(local_path))
df = pd.read_parquet(local_path)

df.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,CookTime_Minutes,PrepTime_Minutes,TotalTime_Minutes,Combined_Features_Clean
0,38.0,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo...",1440,45,1485,Low-Fat Berry Blue Frozen Dessert Frozen Desse...
1,39.0,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...,25,240,265,Biryani Chicken Breast Make share Biryani reci...
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu...",5,30,35,Best Lemonade Beverages one first Good House K...
3,41.0,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces...",20,1440,1460,Carina's Tofu-Vegetable Kebabs Soy/Tofu dish b...
4,42.0,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,...",30,20,50,Cabbage Soup Vegetable Make share Cabbage Soup...


In [5]:
df.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions',
       'CookTime_Minutes', 'PrepTime_Minutes', 'TotalTime_Minutes',
       'Combined_Features_Clean'],
      dtype='object')

In [6]:
str(df.iloc[0]['Combined_Features_Clean'])

'Low-Fat Berry Blue Frozen Dessert Frozen Desserts Make share Low-Fat Berry Blue Frozen Dessert recipe Food.com. Dessert Low Protein Low Cholesterol Healthy Free Of... Summer Weeknight Freezer Easy'

### Load data into chunked documents

In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [8]:
embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


### Note: chunking may not be useful for us, as the EDA shows the following distribution of `Combined_Features_Clean` lengths (in characters)
print(new_recipe['Combined_Features_Clean'].str.len().describe())


count    522517.000000\
mean        211.461774\
std         122.388646\
min          44.000000\
25%         131.000000\
50%         180.000000\
75%         255.000000\
max        4174.000000\
Name: Combined_Features_Clean, dtype: float64

In [10]:
def create_textsplitter(chunks, overlaps):
    """Build one RecursiveCharacterTextSplitter per (chunk size, overlap) pair.

    Args:
        chunks: iterable of chunk sizes, in characters.
        overlaps: iterable of overlaps. A value strictly between 0 and 1 is
            treated as a fraction of the chunk size (0.25 -> 25% of the
            chunk); any other value is used as a character count.

    Returns:
        dict mapping 'chunk{chunk}_overlap{overlap}' to the splitter. The key
        uses the overlap exactly as it was passed in.
    """
    splits = {}
    for chunk in chunks:
        for overlap in overlaps:
            # chunk_overlap is measured in characters, so handing it a fraction
            # such as 0.25 directly results in effectively no overlap.
            overlap_chars = int(chunk * overlap) if 0 < overlap < 1 else int(overlap)
            splits[f'chunk{chunk}_overlap{overlap}'] = RecursiveCharacterTextSplitter(
                chunk_size=chunk,
                chunk_overlap=overlap_chars,
            )
    return splits

In [11]:
chunks = [128, 256, 512, 1024, 2048]
overlaps = [.25, .2, .15, .1]

text_splits = create_textsplitter(chunks, overlaps)

In [12]:
text_splits['chunk1024_overlap0.25']

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x7f3e2d9d72b0>

In [13]:
# (source column, metadata key, fallback text), in the order the metadata is emitted.
_METADATA_FIELDS = (
    ('RecipeId', 'recipe_id', 'No ID Available'),
    ('Name', 'name', 'No Name Available'),
    ('CookTime', 'cook_time', 'No Cook Time Available'),
    ('PrepTime', 'prep_time', 'No Prep Time Available'),
    ('TotalTime', 'total_time', 'No Total Time Available'),
    ('RecipeCategory', 'recipe_category', 'No Category Available'),
    ('Keywords', 'keywords', 'No Keywords Available'),
    ('AggregatedRating', 'aggregated_rating', 'No Rating Available'),
    ('ReviewCount', 'review_count', 'No Reviews Available'),
    ('Calories', 'calories', 'No Calories Information Available'),
    ('FatContent', 'fat_content', 'No Fat Content Available'),
    ('SaturatedFatContent', 'saturated_fat_content', 'No Saturated Fat Content Available'),
    ('CholesterolContent', 'cholesterol_content', 'No Cholesterol Content Available'),
    ('SodiumContent', 'sodium_content', 'No Sodium Content Available'),
    ('CarbohydrateContent', 'carbohydrate_content', 'No Carbohydrate Content Available'),
    ('SugarContent', 'sugar_content', 'No Sugar Content Available'),
    ('ProteinContent', 'protein_content', 'No Protein Content Available'),
    ('RecipeServings', 'recipe_servings', 'No Servings Information Available'),
    ('RecipeYield', 'recipe_yield', 'No Yield Information Available'),
)


def _is_missing(value):
    """Return True when `value` is null, or is list-like with every element null."""
    missing = pd.isna(value)
    # pd.isna gives a plain bool for a scalar (which has no .all()) and an
    # element-wise array for list-like values, so both shapes are handled here.
    return bool(missing.all()) if hasattr(missing, 'all') else bool(missing)


def create_documents(df):
    """Turn each row of the recipe frame into a langchain Document.

    Args:
        df: DataFrame holding the columns named in _METADATA_FIELDS plus
            'Combined_Features_Clean'.

    Returns:
        list of Document, one per row. page_content is the stringified
        'Combined_Features_Clean' value; metadata maps each metadata key to
        the stringified column value, or to the fallback text when the value
        is missing.
    """
    documents = []
    for _, row in df.iterrows():
        metadata = {
            key: fallback if _is_missing(row[column]) else str(row[column])
            for column, key, fallback in _METADATA_FIELDS
        }

        # Use Combined_Features_Clean for the document content
        text = str(row['Combined_Features_Clean'])
        documents.append(Document(page_content=text, metadata=metadata))

    return documents


In [14]:
# Fixed seed so the sampled corpus (and everything indexed from it) is the same on every run;
# n is capped at the frame size so a smaller dataset does not make sample() raise.
documents = create_documents(df.sample(n=min(100000, len(df)), random_state=42))

In [15]:
documents[0]

Document(metadata={'recipe_id': '453572.0', 'name': 'One Feather Mopping Sauce', 'cook_time': 'No Cook Time Available', 'prep_time': 'PT15M', 'total_time': 'PT15M', 'recipe_category': 'Low Protein', 'keywords': "['Low Cholesterol' '< 15 Mins' 'Easy']", 'aggregated_rating': 'No Rating Available', 'review_count': 'No Reviews Available', 'calories': '725.0', 'fat_content': '54.9', 'saturated_fat_content': '4.1', 'cholesterol_content': '0.0', 'sodium_content': '197.2', 'carbohydrate_content': '28.8', 'sugar_content': '5.9', 'protein_content': '3.5', 'recipe_servings': '1.0', 'recipe_yield': '3 cups'}, page_content="One Feather Mopping Sauce Low Protein Leinenkugel's (Line - N - koo - gulls) Beer brewed 10 miles &amp; known many local's &quot;One Feather&quot;...if look label, understand. popular tourists &amp; even locals drink it...many cook it. try use BIG BUTT ribs (&amp; fish batter) &amp; Leinie's Light chicken, use whatever trips trigger. version &quot;mop&quot; ribs &amp; such. Low 

In [16]:
len(documents)

100000

In [None]:
def split_documents_with_metadata(documents, text_splitter):
    """Split every document into chunks, carrying the parent metadata along.

    Each chunk becomes a new Document whose metadata is a copy of the source
    document's metadata plus a 'chunk_id' giving the chunk's position within
    that source document (starting at 0).
    """
    return [
        Document(page_content=piece, metadata={**source.metadata, "chunk_id": position})
        for source in documents
        for position, piece in enumerate(text_splitter.split_text(source.page_content))
    ]

In [None]:
split_documents = {key: split_documents_with_metadata(documents, text_splitter_recursive) for key, text_splitter_recursive in text_splits.items()}

In [None]:
split_documents['chunk256_overlap0.25'][0]

In [17]:
# We have > 500,000 recipes, this takes a long time to run
from langchain_community.vectorstores import Qdrant

qdrant_store_nosplit = Qdrant.from_documents(documents,
    embedding_model,
    location=":memory:",
)

# qdrant_store = Qdrant.from_documents(split_documents['chunk1024_overlap0.25'],
#     embedding_model,
#     location=":memory:",
# )

KeyboardInterrupt: 

In [None]:
qdrant_retriever = qdrant_store_nosplit.as_retriever()

### Reranker retriever

In [None]:
def pretty_print_docs(docs):
    """Print each document's page_content under a numbered header.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# The chunked `qdrant_store` is only present as commented-out code; the store that is
# actually built is `qdrant_store_nosplit`, so the reranker retrieves from that one.
qdrant_retriever_rerank = qdrant_store_nosplit.as_retriever(search_kwargs={"k": 20})
reranker_model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
# Retrieve 20 candidates, then keep the 3 the cross-encoder scores highest.
compressor = CrossEncoderReranker(model=reranker_model, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=qdrant_retriever_rerank
)

compressed_docs = compression_retriever.invoke("What pizza can I make in 20 minutes?")
pretty_print_docs(compressed_docs)

### Agent function definitions

In [None]:
def format_docs(docs):
    """Render retrieved documents as one context string.

    Only each document's metadata is emitted (as 'Metadata: {...}' followed by
    a newline); page_content is not included. Entries are joined with a blank
    line between them. An empty input yields an empty string.
    """
    return "\n\n".join(f"Metadata: {doc.metadata}\n" for doc in docs)

In [None]:
recipe_db_query_tool = {
  "name": "query_food_recipe_vector_db",
  "description": """
      Queries the vector database containing food recipes to retrieve the most relevant documents. 
      This function allows the model to generate and execute multiple queries as necessary to gather comprehensive context, 
      such as ingredients, preparation steps, and metadata like cuisine and diet type, ensuring accurate and thorough responses to user queries.
      """,
  "input_schema": {
    "type": "object",
    "properties": {
      "queries": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "A query generated by the model to run against the vector database to fetch recipe documents."
        },
        "description": "A list of queries generated by the model to run against the vector database to fetch recipe documents."
      }
    },
    "required": ["queries"]
  }
}


### Init Bedrock model and define utilities for stateless messaging (no function calling)

In [None]:
import json

In [None]:
bedrock_client = boto3.client('bedrock-runtime', region_name="us-east-1")

In [None]:
# We will need to tune these prompts
# query_bedrock_llm() looks up baseline_sys_prompt when it is called, so re-running
# this cell is enough for a prompt change to take effect

baseline_sys_prompt = """
You are a helpful assistant and expert in cooking recipes.

Before answering, always make at least one call to query_food_recipe_vector_db
to retrieve the relevant context of recipes and ingredients to generate an informed
and high-quality response to the user prompt but NEVER exceed a MAXIMUM of 
3 calls to the query_food_recipe_vector_db function.

Provide a response to the user prompt about food with recommended recipes and instructions.
"""

In [None]:
# We will need to tune these prompts

baseline_sys_prompt_basic = """
You are a helpful assistant and expert in cooking recipes.
You will be provided a list of relevant context about relevant food recipes
which includes metadata like ingredients and cooking instructions.

Provide a response to the user prompt about food with recommended recipes and instructions.
"""

In [None]:
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"

# model_id is bound when this cell runs. MODEL_ID is assigned again further down the
# notebook (the eval cell points it at a different model), and looking it up at call
# time would silently switch this function to whichever model was assigned last.
def query_bedrock_llm(messages, model_id=MODEL_ID):
    """Send a chat history to Bedrock with the recipe query tool available.

    Args:
        messages: chat history as a list of Anthropic-style message dicts.
        model_id: Bedrock model identifier; defaults to the value MODEL_ID had
            when this cell was executed.

    Returns:
        The parsed JSON response body (dict).
    """
    request_body = {
        'anthropic_version': 'bedrock-2023-05-31', # This is required to use chat style messages object
        'system': baseline_sys_prompt,
        'messages': messages,
        'max_tokens': 3000,
        "tools": [recipe_db_query_tool],

        # To force the model to always call the recipe db query tool, add:
        #   "tool_choice": {"type": "tool", "name": recipe_db_query_tool['name']}
        # https://docs.anthropic.com/en/docs/build-with-claude/tool-use#controlling-claudes-output

        # TODO: TUNE THESE VALUES
        'temperature': 0.1,
        'top_p': 0.9
    }
    response = bedrock_client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body)
    )
    response_body = json.loads(response.get('body').read())

    return response_body

In [None]:
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"

# model_id is bound when this cell runs. MODEL_ID is assigned again further down the
# notebook (the eval cell points it at a different model), and looking it up at call
# time would silently switch this function to whichever model was assigned last.
def query_bedrock_llm_basic(messages, model_id=MODEL_ID):
    """Send a chat history to Bedrock without tools and return the answer text.

    Args:
        messages: chat history as a list of Anthropic-style message dicts.
        model_id: Bedrock model identifier; defaults to the value MODEL_ID had
            when this cell was executed.

    Returns:
        The 'text' of the first content block of the response. The same text
        is also printed.
    """
    request_body = {
        'anthropic_version': 'bedrock-2023-05-31', # This is required to use chat style messages object
        'system': baseline_sys_prompt_basic,
        'messages': messages,
        'max_tokens': 3000,

        # TODO: TUNE THESE VALUES
        'temperature': 0.1,
        'top_p': 0.9
    }
    response = bedrock_client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body)
    )
    response_body = json.loads(response.get('body').read())

    answer = response_body['content'][0]['text']
    print(answer)

    return answer

### Pipe langchain together

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableMap

In [None]:
baseline_user_prompt_basic = """
### Here is a context:
{context} 

### Here is a user prompt:
{query}
"""

In [None]:
baseline_user_prompt = """
### Here is a user prompt:
{query}
"""

In [None]:
def process_prompt(query_args):
    """Fill the user prompt template and wrap it as a one-message chat history.

    Args:
        query_args: mapping with a 'query' entry holding the user's question.

    Returns:
        A list containing a single user-role message with one text block.
    """
    rendered = baseline_user_prompt.replace("{query}", query_args['query'])

    # The list-of-messages shape is not needed for a single turn, but it is the
    # shape used later to persist chat history for continuous dialogue.
    text_block = {"type": "text", "text": rendered}
    return [{"role": "user", "content": [text_block]}]

In [None]:
def process_prompt_basic(query_args):
    """Fill the context+query prompt template and wrap it as a one-message chat history.

    Args:
        query_args: mapping with 'context' (retrieved context text) and
            'query' (the user's question).

    Returns:
        A list containing a single user-role message with one text block.
    """
    # {context} is substituted first, then {query} is substituted in the result.
    rendered = (
        baseline_user_prompt_basic
        .replace("{context}", query_args['context'])
        .replace("{query}", query_args['query'])
    )

    # The list-of-messages shape is not needed for a single turn, but it is the
    # shape used later to persist chat history for continuous dialogue.
    text_block = {"type": "text", "text": rendered}
    return [{"role": "user", "content": [text_block]}]

In [None]:
qdrant_rag_chain = (
    RunnableMap(
        # {"context": qdrant_retriever | format_docs,
         {"query": RunnablePassthrough()}
    )
    | process_prompt
    | query_bedrock_llm
    # | parse_event_stream
)

In [None]:
qdrant_rag_chain_basic = (
    RunnableMap(
        {"context": qdrant_retriever | format_docs,
         "query": RunnablePassthrough()}
    )
    | process_prompt_basic
    | query_bedrock_llm_basic
)

In [None]:
qdrant_rag_chain_rerank = (
    RunnableMap(
        {"context": compression_retriever | format_docs,
         "query": RunnablePassthrough()}
    )
    | process_prompt_basic
    | query_bedrock_llm_basic
)

### Model generates dynamic context queries to vector db

In [None]:
test_queries = {
    '1': """
    I enjoy asian fusion food and I am a vegetarian. 
    Give me one recipe with ingredients and instructions
    """,

    '2': """
    I have a peanut allergy but I like thai food. 
    I also don't enjoy spicy food much, and want a meal with low carbs. 
    Give a recipe with ingredients and instructions
    """,

    '3': """
    Suggest a low-carb breakfast recipe that includes eggs and spinach, 
    can be prepared in under 20 minutes, 
    and is suitable for a keto diet.
    """,

    '4': """
    Suggest a healthy dinner recipe for two people that includes fish, 
    is under 500 calories per serving, 
    and can be made in less than 40 minutes.
    """,

    '5': """
    I am on a ketogenic diet and need a dinner recipe that is dairy-free, 
    low in sodium, and takes less than an hour to cook.
    """,

    '6': """
    I'm looking for a pescatarian main course that is low in saturated fat, 
    uses Asian flavors, and can be prepared in under 45 minutes.
    """,

    '7': """
    I need a diabetic-friendly, vegan breakfast recipe that is gluten-free, 
    nut-free, and low in cholesterol, but also rich in omega-3 fatty acids 
    and can be prepared the night before.
    """,

    '8': """
    I am following a strict paleo diet and need a lunch recipe that is dairy-free, 
    gluten-free, low in carbs, and low in sodium. Additionally, it should be rich in antioxidants, 
    and can be made in under 30 minutes with minimal cooking equipment.
    """
}



In [None]:
qdrant_rag_chain.invoke(test_queries['1'])

### Implement continuous dialogue and function calling

In [None]:
def generate_message(prompt):
    """Wrap a prompt string as a user-role chat message.

    Args:
        prompt: the text to send; must be a str (subclasses are accepted).

    Returns:
        dict with role 'user' and a single text content block.

    Raises:
        ValueError: if prompt is not a string.
    """
    if not isinstance(prompt, str):
        raise ValueError(f'Tried to call generate_message with non-string input: {prompt}')

    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt
            }
        ]
    }

In [None]:
def generate_tool_message(fn_results):
    """Wrap tool results as a user-role chat message.

    fn_results is used as the message content as-is (no copy is made).
    """
    tool_message = dict(role="user", content=fn_results)
    return tool_message

In [None]:
import json

# Adds the current prompt as a new message to the chat history
# and calls bedrock with the entire chat history
# Returns the response body, llm's message, and new chat history

'''
Example response body structure:
{
   "id":"msg_bdrk_01C5GGkafK7aL3P5i3rsMr1p",
   "type":"message",
   "role":"assistant",
   "model":"claude-3-sonnet-20240229",
   "content":[
      {
         "type":"tool_use",
         "id":"toolu_bdrk_01CQiYa8BMJfpJC68DuRdwQn",
         "name":"query_food_recipe_vector_db",
         "input":{
            "queries":[
               "healthy fish dinner recipe under 500 calories",
               "fish dinner recipe for two under 40 minutes"
            ]
         }
      }
   ],
   "stop_reason":"tool_use",
   "stop_sequence":"None",
   "usage":{
      "input_tokens":559,
      "output_tokens":55
   }
}
'''
def message_handler(existing_chat_history, prompt, is_tool_message=False):
    """Append a new user turn to the history, query the model, and record its reply.

    Args:
        existing_chat_history: list of prior messages; it is appended to in place.
        prompt: a string for a normal turn, or the list of tool result objects
            when is_tool_message is True.
        is_tool_message: selects the tool-result message shape instead of the
            plain text one.

    Returns:
        [response_body, llm_message, existing_chat_history], where llm_message
        holds the 'role' and 'content' of the response.
    """
    # Tool results are an array of tool response objects, so they need their own message shape.
    build_message = generate_tool_message if is_tool_message else generate_message
    existing_chat_history.append(build_message(prompt))

    # The model always receives the entire history, including the turn just added.
    response_body = query_bedrock_llm(existing_chat_history)
    llm_message = {field: response_body[field] for field in ('role', 'content')}

    # Record the model's reply so the next call sees it.
    existing_chat_history.append(llm_message)

    return [response_body, llm_message, existing_chat_history]

In [None]:
# Executes a list of queries and returns a list of document results
def handle_vector_db_queries(queries, retriever=qdrant_retriever):
    """Run each query against the retriever and collect all returned documents.

    Args:
        queries: list of query strings. A single string is also accepted and
            treated as a one-element list.
        retriever: object exposing invoke(query); defaults to qdrant_retriever
            as it was when this function was defined.

    Returns:
        A flat list of the documents returned for every query, in query order.
        Documents returned by more than one query appear more than once.
    """
    # A bare string would otherwise be iterated character by character,
    # issuing one retrieval per character.
    if isinstance(queries, str):
        queries = [queries]

    context_docs = []
    for query in queries:
        context_docs.extend(retriever.invoke(query))

    return context_docs

In [None]:
# Takes as an argument to LLM message content, returns a list of the fn result objects
def handle_function_calls(tool_call_message_content, retriever=qdrant_retriever):
    """Execute the tool calls found in an LLM message and build their result objects.

    Args:
        tool_call_message_content: the 'content' list of an LLM message.
            Blocks whose 'type' is not 'tool_use' are ignored.
        retriever: retriever handed to handle_vector_db_queries.

    Returns:
        A list with one 'tool_result' dict per tool_use block, in order. A
        failed call (unknown function, or missing 'queries' argument) gets
        empty content and is_error set to True.
    """
    tool_results = []

    for block in tool_call_message_content:
        # Only process messages from the LLM that are function calls
        if block['type'] != 'tool_use':
            continue

        call_id = block['id']
        called_fn = block['name']
        call_args = block['input']
        outcome = {"type": "tool_result", "tool_use_id": call_id}

        # TODO: handle web search invocation here
        if called_fn != 'query_food_recipe_vector_db':
            print(f"ERROR: Attempted call to unknown function {called_fn}")
            outcome.update(content="", is_error=True)
        elif 'queries' not in call_args:
            print(f"ERROR: Tried to call {called_fn} with invalid args {call_args}, skipping..")
            outcome.update(content="", is_error=True)
        else:
            print(f"Model called {called_fn} with args {call_args}")
            retrieved = handle_vector_db_queries(call_args['queries'], retriever)
            outcome['content'] = format_docs(retrieved)

        tool_results.append(outcome)

    return tool_results

### Run dynamic query function calling with test queries

In [None]:
'''
Example payload structure of response_body:

{'id': 'msg_bdrk_01REesjegNiLteurBoxW7pSt',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-3-sonnet-20240229',
 'content': [{'type': 'tool_use',
   'id': 'toolu_bdrk_01191W2FuAFTRoDqKKeJSmmn',
   'name': 'query_food_recipe_vector_db',
   'input': {'queries': ['thai food',
     'peanut free',
     'low carb',
     'not spicy']}}],
 'stop_reason': 'tool_use',
 'stop_sequence': None,
 'usage': {'input_tokens': 573, 'output_tokens': 52}}


Example payload structure of llm_message['content']:

[{'type': 'tool_use',
   'id': 'toolu_bdrk_01191W2FuAFTRoDqKKeJSmmn',
   'name': 'query_food_recipe_vector_db',
   'input': {'queries': ['thai food',
     'peanut free',
     'low carb',
     'not spicy']}}]
'''

# This function is the entry point to invoke the LLM with support for function calling
# parsing output, calling requested functions, sending output is handled here
def run_chat_loop(prompt, retriever=qdrant_retriever, max_tool_rounds=10):
    """Run one user prompt through the model, serving tool calls until it answers.

    Args:
        prompt: the user's question (string).
        retriever: retriever used to serve the model's vector db queries.
        max_tool_rounds: upper bound on how many times tool results are sent
            back to the model before giving up.

    Returns:
        The model's answer prefixed with '\\n[Model]: '. The same text is printed.

    Raises:
        RuntimeError: if the model is still requesting tools after
            max_tool_rounds rounds.
    """
    print(f"[User]: {prompt}")
    response_body, llm_message, chat_history = message_handler(existing_chat_history=[], prompt=prompt)

    # The model wants to call tools: call them, provide the response, and repeat
    # until content is generated. The system prompt only asks the model to limit
    # its calls, so the bound is enforced here as well.
    tool_rounds = 0
    while response_body['stop_reason'] == 'tool_use':
        if tool_rounds >= max_tool_rounds:
            raise RuntimeError(
                f"Model was still requesting tools after {max_tool_rounds} rounds; aborting chat loop"
            )
        tool_rounds += 1

        fn_results = handle_function_calls(tool_call_message_content=llm_message['content'], retriever=retriever)

        # Send function results back to LLM as a new message with the existing chat history
        response_body, llm_message, chat_history = message_handler(
            existing_chat_history=chat_history,
            prompt=fn_results,
            is_tool_message=True
        )

    # The model is done calling tools. Gather every text block rather than assuming
    # the first content block is text (content may be empty or start with another block type).
    answer = "".join(
        block['text'] for block in llm_message['content'] if block.get('type') == 'text'
    )
    reply = f"\n[Model]: {answer}"
    print(reply)
    return reply


In [None]:
run_chat_loop(test_queries['1'])

### Evaluation method
1. Use LLM as a judge 
2. Basic Sniff test

### Evaluation pipeline
1. Look at EDA result and determine the type of cuisine present in the data. Come up with 8 "Test Questions". - Done. 
    test_query_1 to test_query_8
2. Feed the test questions in to the basic RAG, function calling RAG
3. Gather response and use the following two methods to evaluate them.
   1. LLM as a judge (provide grading rubric) and ask the eval LLM to provide a score of 1-5.
   2. Sniff test 
  

### Recipe Grading Criteria - Pass into the Eval LLM as part of the prompt

Grading Scale (1-5)

5 - Exceptional Recipe:

    1. Accuracy: The recipe is highly accurate and closely matches the user's query, including all specified ingredients, dietary restrictions, and desired cuisine type.
    2. Clarity: The instructions are clear, easy to follow, and logically sequenced. Cooking times and temperatures are precise.
    3. Creativity: The recipe demonstrates creativity, offering a unique or interesting twist on a classic dish or a novel combination of ingredients.
    4. Completeness: The recipe includes all necessary details, such as ingredient measurements, preparation steps, serving suggestions, and any relevant tips or variations.
    5. Healthiness: The recipe provides a balanced nutritional profile, aligning with any specified health goals or dietary considerations.
    6. User Feedback: The recipe is likely to receive high ratings from users for both taste and ease of preparation.


4 - Very Good Recipe:

    1. Accuracy: The recipe mostly matches the user's query with minor deviations or substitutions that still align with the user's dietary restrictions and preferences.
    2. Clarity: The instructions are clear and easy to follow, with only minor areas that could benefit from additional detail.
    3. Creativity: The recipe shows some creativity and presents an appealing dish, though it may not be as unique as a 5-rated recipe.
    4. Completeness: The recipe includes most necessary details, but might miss a few minor tips or variations.
    5. Healthiness: The recipe is generally healthy, though it may not be as nutritionally balanced as a 5-rated recipe.
    6. User Feedback: The recipe is likely to receive good ratings from users, being tasty and reasonably easy to prepare.

3 - Good Recipe:

    1. Accuracy: The recipe has a reasonable match with the user's query but may include some inaccuracies or ingredient substitutions that slightly alter the dish's nature.
    2. Clarity: The instructions are generally clear but may have a few confusing steps or lack detailed guidance in some areas.
    3. Creativity: The recipe is standard with minimal creativity or uniqueness.
    4. Completeness: The recipe includes the essential details but lacks additional helpful information or suggestions.
    5. Healthiness: The recipe is moderately healthy but may lack balance in terms of nutritional profile.
    6. User Feedback: The recipe is expected to receive average ratings, being satisfactory but not outstanding in taste or ease of preparation.

2 - Fair Recipe:

    1. Accuracy: The recipe has noticeable discrepancies from the user's query, potentially including ingredients that were supposed to be excluded due to dietary restrictions.
    2. Clarity: The instructions are unclear or difficult to follow, with significant gaps or ambiguities.
    3. Creativity: The recipe lacks creativity and may appear bland or uninspired.
    4. Completeness: The recipe is missing several important details, such as precise measurements or key preparation steps.
    5. Healthiness: The recipe is not particularly healthy and may have an unbalanced nutritional profile.
    6. User Feedback: The recipe is likely to receive below-average ratings due to issues with taste, clarity, or preparation difficulty.

1 - Poor Recipe:

    1. Accuracy: The recipe significantly deviates from the user's query, ignoring key dietary restrictions or preferences.
    2. Clarity: The instructions are confusing, incomplete, or incorrect, making the recipe difficult or impossible to follow.
    3. Creativity: The recipe is not creative and may seem haphazard or poorly thought out.
    4. Completeness: The recipe is missing critical details, such as major ingredients, steps, or cooking times.
    5. Healthiness: The recipe is unhealthy and lacks a balanced nutritional profile.
    6. User Feedback: The recipe is likely to receive low ratings due to poor taste, difficulty in preparation, or failure to meet user expectations.

In [None]:
# Eval framework
MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"

# model_id is bound when this cell runs. MODEL_ID is also assigned by the generation
# cells earlier in the notebook, so a call-time lookup would make the judge use
# whichever model was assigned last (e.g. after re-running one of those cells).
def query_bedrock_eval_llm(messages, model_id=MODEL_ID):
    """Send a chat history to the evaluation model on Bedrock.

    No system prompt and no tools are attached to the request.

    Args:
        messages: chat history as a list of Anthropic-style message dicts.
        model_id: Bedrock model identifier; defaults to the value MODEL_ID had
            when this cell was executed.

    Returns:
        The parsed JSON response body (dict).
    """
    request_body = {
        'anthropic_version': 'bedrock-2023-05-31', # This is required to use chat style messages object
        'messages': messages,
        'max_tokens': 3000,

        # TODO: TUNE THESE VALUES
        'temperature': 0.1,
        'top_p': 0.9,
        'top_k' : 2,
    }
    response = bedrock_client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body)
    )
    response_body = json.loads(response.get('body').read())

    return response_body

In [None]:
qdrant_rag_chain_eval = (
    RunnableMap(
        # {"context": qdrant_retriever | format_docs,
         {"query": RunnablePassthrough()}
    )
    | process_prompt
    | query_bedrock_eval_llm
    # | parse_event_stream
)

In [None]:
def llm_judge_eval(queries_index_list, model_responses_dict):
    """Score every collected recipe response with the LLM judge chain.

    For each model in `model_responses_dict` and each index in `queries_index_list`,
    a grading prompt (user query + generated recipe + 1-5 rubric) is built and sent
    through `qdrant_rag_chain_eval`. Each judge output is printed as it arrives.

    queries_index_list: indices used to look up both `test_queries` and each model's responses
    model_responses_dict: expected input {model_name: {index: response}}
    return dict {model_name: {index: judge_output}} mirroring the input layout

    Relies on the notebook-level names `test_queries` and `qdrant_rag_chain_eval`.
    A missing index raises KeyError from the corresponding lookup.
    """
    eval_responses = {}

    for model_name, response_dict in model_responses_dict.items():
        eval_responses[model_name] = {}

        for i in queries_index_list:
            query = test_queries[i]
            # Kept under its own name so the judge output below never overwrites the recipe.
            recipe_response = response_dict[i]

            eval_message = f"""
            You are a helpful assistant and expert in reviewing cooking recipes.

            Please look at the given user query, recipe generated by another LLM and follow the rubric below. Provide a score between 1-5 on how good the recipe is.

            user query : "{query}"

            Recipe generated by another LLM: "{recipe_response}"

            Recipe review rubric: 

            Grading Scale (1-5)

            5 - Exceptional Recipe:

            1. Accuracy: The recipe is highly accurate and closely matches the user's query, including all specified ingredients, dietary restrictions, and desired cuisine type.
            2. Clarity: The instructions are clear, easy to follow, and logically sequenced. Cooking times and temperatures are precise.
            3. Creativity: The recipe demonstrates creativity, offering a unique or interesting twist on a classic dish or a novel combination of ingredients.
            4. Completeness: The recipe includes all necessary details, such as ingredient measurements, preparation steps, serving suggestions, and any relevant tips or variations.
            5. Healthiness: The recipe provides a balanced nutritional profile, aligning with any specified health goals or dietary considerations.
            6. User Feedback: The recipe is likely to receive high ratings from users for both taste and ease of preparation.
            4 - Very Good Recipe:

            1. Accuracy: The recipe mostly matches the user's query with minor deviations or substitutions that still align with the user's dietary restrictions and preferences.
            2. Clarity: The instructions are clear and easy to follow, with only minor areas that could benefit from additional detail.
            3. Creativity: The recipe shows some creativity and presents an appealing dish, though it may not be as unique as a 5-rated recipe.
            4. Completeness: The recipe includes most necessary details, but might miss a few minor tips or variations.
            5. Healthiness: The recipe is generally healthy, though it may not be as nutritionally balanced as a 5-rated recipe.
            6. User Feedback: The recipe is likely to receive good ratings from users, being tasty and reasonably easy to prepare.
            3 - Good Recipe:

            1. Accuracy: The recipe has a reasonable match with the user's query but may include some inaccuracies or ingredient substitutions that slightly alter the dish's nature.
            2. Clarity: The instructions are generally clear but may have a few confusing steps or lack detailed guidance in some areas.
            3. Creativity: The recipe is standard with minimal creativity or uniqueness.
            4. Completeness: The recipe includes the essential details but lacks additional helpful information or suggestions.
            5. Healthiness: The recipe is moderately healthy but may lack balance in terms of nutritional profile.
            6. User Feedback: The recipe is expected to receive average ratings, being satisfactory but not outstanding in taste or ease of preparation.
            2 - Fair Recipe:

            1. Accuracy: The recipe has noticeable discrepancies from the user's query, potentially including ingredients that were supposed to be excluded due to dietary restrictions.
            2. Clarity: The instructions are unclear or difficult to follow, with significant gaps or ambiguities.
            3. Creativity: The recipe lacks creativity and may appear bland or uninspired.
            4. Completeness: The recipe is missing several important details, such as precise measurements or key preparation steps.
            5. Healthiness: The recipe is not particularly healthy and may have an unbalanced nutritional profile.
            6. User Feedback: The recipe is likely to receive below-average ratings due to issues with taste, clarity, or preparation difficulty.
            1 - Poor Recipe:

            1. Accuracy: The recipe significantly deviates from the user's query, ignoring key dietary restrictions or preferences.
            2. Clarity: The instructions are confusing, incomplete, or incorrect, making the recipe difficult or impossible to follow.
            3. Creativity: The recipe is not creative and may seem haphazard or poorly thought out.
            4. Completeness: The recipe is missing critical details, such as major ingredients, steps, or cooking times.
            5. Healthiness: The recipe is unhealthy and lacks a balanced nutritional profile.
            6. User Feedback: The recipe is likely to receive low ratings due to poor taste, difficulty in preparation, or failure to meet user expectations.

            The response format should be the following :

            User Query : 
            Recipe reviewed : 
            Score : 
            Reasoning : 
                1. Accuracy:
                2. Clarity :
                3. Creativity :
                4. Completeness : 
                5. Healthiness : 
                6. User Feedback :
            """

            # The chain output is stored as returned; extracting ['content'][0]['text']
            # is left to the caller.
            eval_response = qdrant_rag_chain_eval.invoke(eval_message)
            eval_responses[model_name][i] = eval_response

            print(f'Model: {model_name}')
            print(eval_response)
            print("========\n")

    # Return only after every (model, query) pair has been judged.
    return eval_responses
    

In [None]:
def model_testing(query_index, model_dict):
    """
    Run every model in model_dict on the selected test queries and collect the responses.

    query_index: list of indices to pull from test_queries dict
    model_dict: expected input {model_name: (model_object, boolean for function calling)}
    return dict {model_name_1: {1: response_1, 2: response_2, etc}}

    Relies on the notebook-level names `test_queries` and `run_chat_loop`.
    """
    responses_by_model = {}

    # Iterate the arguments, not the notebook globals, so the function honours
    # whatever models and indices the caller passes in.
    for model_name, (model, func_call) in model_dict.items():
        responses_for_model = {}

        for i in query_index:
            query = test_queries[i]

            if func_call:
                # Function-calling models are driven through the chat loop helper.
                response = run_chat_loop(query, model)
            else:
                response = model.invoke(query)

            responses_for_model[i] = response

        responses_by_model[model_name] = responses_for_model

    return responses_by_model

In [None]:
# Keys of test_queries to run through every model
queries_test_index = ['1', '2']

# Each entry maps a model name to (model object, flag). When the flag is True,
# model_testing drives the model through run_chat_loop (function calling);
# otherwise it calls model.invoke directly.
models_to_test = dict(
    baseline=(qdrant_rag_chain_basic, False),
    rerank=(qdrant_rag_chain_rerank, False),
    func_call_rerank=(compression_retriever, True),
)

model_responses = model_testing(queries_test_index, models_to_test)

In [None]:
# Display the responses collected by model_testing, keyed by model name and then query index.
model_responses

In [None]:
# Run the LLM-as-judge evaluation on the collected model responses.
llm_judge_eval(queries_test_index, model_responses)