### Install dependencies

In [1]:
!pip install boto3 sagemaker langchain langchain-community langchain-core faiss-cpu requests opensearch-py sentence-transformers langchain-text-splitters requests-aws4auth qdrant-client -U



### Util to fetch an API key from AWS Secrets Manager

In [1]:
import boto3
import json

def get_secret(secret_name, is_json=False, region_name='us-east-1'):
    """Fetch a secret string from AWS Secrets Manager.

    Args:
        secret_name: Identifier of the secret, passed to the API as ``SecretId``.
        is_json: When True, the secret string is parsed with ``json.loads`` and
            the parsed value is returned instead of the raw string.
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The raw ``SecretString`` value, or its JSON-decoded form when
        ``is_json`` is True.

    Raises:
        Any exception raised by the Secrets Manager call propagates unchanged.
        KeyError: If the response has no ``SecretString`` entry.
        json.JSONDecodeError: If ``is_json`` is True and the secret is not valid JSON.
    """
    client = boto3.client('secretsmanager', region_name=region_name)

    # No try/except here: catching and immediately re-raising added nothing.
    response = client.get_secret_value(SecretId=secret_name)
    secret = response['SecretString']

    return json.loads(secret) if is_json else secret

### Load Parquet data from S3

In [2]:
!pwd

/home/reson/Documents/GitHub/Enterprise-RAG/notebooks


In [3]:
import boto3
import pandas as pd

# S3 client used below to download the dataset; no explicit credentials or region are passed here.
s3 = boto3.client('s3')
# Name of the bucket the dataset file is downloaded from.
bucket_name = 'recipes-rag'

In [4]:
from pathlib import Path

# Download the recipe dataset from S3 into the local data directory and load it.
file_key = 'recipes_w_cleaning_time_combined_features.parquet'
local_path = Path('../data') / file_key
# The download fails if the target directory is missing, so create it first.
local_path.parent.mkdir(parents=True, exist_ok=True)
s3.download_file(bucket_name, file_key, str(local_path))
df = pd.read_parquet(local_path)

df.head()

Unnamed: 0,Name,RecipeCategory,Description,Keywords_string,RecipeIngredientQuantities,RecipeIngredientParts,RecipeInstructions,AggregatedRating,ReviewCount,CookTime_Minutes,PrepTime_Minutes,TotalTime_Minutes,Combined_Features,Combined_Features_Clean
0,Low-Fat Berry Blue Frozen Dessert,Frozen Desserts,Make and share this Low-Fat Berry Blue Frozen ...,Dessert Low Protein Low Cholesterol Healthy Fr...,"[4, 1⁄4, 1, 1]","[blueberries, granulated sugar, vanilla yogurt...",Toss 2 cups berries with sugar. Let stand for ...,4.5,4.0,1440,45,1485,Low-Fat Berry Blue Frozen Dessert Frozen Desse...,Low-Fat Berry Blue Frozen Dessert Frozen Desse...
1,Biryani,Chicken Breast,Make and share this Biryani recipe from Food.com.,Chicken Thigh & Leg Chicken Poultry Meat Asian...,"[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...","[saffron, milk, hot green chili peppers, onion...",Soak saffron in warm milk for 5 minutes and pu...,3.0,1.0,25,240,265,Biryani Chicken Breast Make and share this Bir...,Biryani Chicken Breast Make share Biryani reci...
2,Best Lemonade,Beverages,This is from one of my first Good House Keepi...,Low Protein Low Cholesterol Healthy Summer < 6...,"[1 1⁄2, 1, None, 1 1⁄2, None, 3⁄4]","[sugar, lemons, rind of, lemon, zest of, fresh...","Into a 1 quart Jar with tight fitting lid, put...",4.5,10.0,5,30,35,Best Lemonade Beverages This is from one of my...,Best Lemonade Beverages one first Good House K...
3,Carina's Tofu-Vegetable Kebabs,Soy/Tofu,This dish is best prepared a day in advance to...,Beans Vegetable Low Cholesterol Weeknight Broi...,"[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1⁄2, 1⁄...","[extra firm tofu, eggplant, zucchini, mushroom...","Drain the tofu, carefully squeezing out excess...",4.5,2.0,20,1440,1460,Carina's Tofu-Vegetable Kebabs Soy/Tofu This d...,Carina's Tofu-Vegetable Kebabs Soy/Tofu dish b...
4,Cabbage Soup,Vegetable,Make and share this Cabbage Soup recipe from F...,Low Protein Vegan Low Cholesterol Healthy Wint...,"[46, 4, 1, 2, 1]","[plain tomato juice, cabbage, onion, carrots, ...",Mix everything together and bring to a boil. R...,4.5,11.0,30,20,50,Cabbage Soup Vegetable Make and share this Cab...,Cabbage Soup Vegetable Make share Cabbage Soup...


In [5]:
df = df.dropna(subset=["AggregatedRating"])

In [6]:
df.columns

Index(['Name', 'RecipeCategory', 'Description', 'Keywords_string',
       'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'RecipeInstructions', 'AggregatedRating', 'ReviewCount',
       'CookTime_Minutes', 'PrepTime_Minutes', 'TotalTime_Minutes',
       'Combined_Features', 'Combined_Features_Clean'],
      dtype='object')

In [7]:
str(df.iloc[0]['Combined_Features_Clean'])

"Low-Fat Berry Blue Frozen Dessert Frozen Desserts Make share Low-Fat Berry Blue Frozen Dessert recipe Food.com. Toss 2 cups berries sugar. Let stand 45 minutes, stirring occasionally. Transfer berry-sugar mixture food processor. Add yogurt process smooth. Strain fine sieve. Pour baking pan (or transfer ice cream maker process according manufacturers' directions). Freeze uncovered edges solid centre soft. Transfer processor blend smooth again. Return pan freeze edges solid. Transfer processor blend smooth again. Fold remaining 2 cups blueberries. Pour plastic mold freeze overnight. Let soften slightly serve. Dessert Low Protein Low Cholesterol Healthy Free Of... Summer Weeknight Freezer Easy"

### Load data into documents

In [8]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import json

In [9]:
embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
  return torch._C._cuda_getDeviceCount() > 0


In [10]:
def create_documents(df):
    """Convert each row of the recipe DataFrame into a LangChain ``Document``.

    Missing values are replaced with empty strings and every column is cast to
    ``str``; an empty field is then replaced by a human-readable placeholder.
    The input frame is not modified.

    Rows are NOT filtered here: a row with a missing ``AggregatedRating`` is
    kept and gets the "No Rating Available" placeholder. (The previous
    ``df.dropna(...)`` inside this function never affected the output because
    its result was discarded.) Filter the frame before calling if such rows
    should be excluded.

    Args:
        df: DataFrame with the columns ``Name``, ``Description``,
            ``RecipeCategory``, ``Keywords_string``, ``RecipeIngredientParts``,
            ``RecipeInstructions``, ``AggregatedRating``, ``ReviewCount`` and
            ``Combined_Features``.

    Returns:
        A list with one ``Document`` per row. ``page_content`` is the row's
        ``Combined_Features`` text and ``metadata`` holds the remaining fields
        as strings.
    """
    # (metadata key, source column, placeholder used when the value is empty)
    metadata_fields = [
        ("name", "Name", "No Name Available"),
        ("description", "Description", "No Description Available"),
        ("recipe_category", "RecipeCategory", "No Category Available"),
        ("keywords", "Keywords_string", "No Keywords Available"),
        (
            "recipe_ingredient_parts",
            "RecipeIngredientParts",
            "No Recipe Ingredient Parts Available",
        ),
        (
            "recipe_instructions",
            "RecipeInstructions",
            "No Recipe Instructions Available",
        ),
        ("aggregated_rating", "AggregatedRating", "No Rating Available"),
        ("review_count", "ReviewCount", "No Reviews Available"),
    ]

    # fillna/astype both return new frames, so the caller's frame stays untouched.
    rows_as_text = df.fillna("").astype(str)

    documents = []
    for _index, row in rows_as_text.iterrows():
        # Every value is a str at this point, so `or` falls back only on "".
        metadata = {
            key: row[column] or placeholder
            for key, column, placeholder in metadata_fields
        }
        page_content = row["Combined_Features"] or "No Content Available"
        documents.append(Document(page_content=page_content, metadata=metadata))

    return documents

In [18]:
max_num_docs = 100
# Fixed seed so the sampled subset (and the index built from it) is the same on every run;
# the size is capped so a frame with fewer than max_num_docs rows does not raise.
df_subset = df.sample(n=min(max_num_docs, len(df)), random_state=42)
df_subset.shape

(100, 14)

In [19]:
documents = create_documents(df_subset)

In [20]:
documents[0]

Document(metadata={'name': 'picadillo', 'description': 'Make and share this picadillo recipe from Food.com.', 'recipe_category': 'Meat', 'keywords': 'Low Cholesterol Kosher < 60 Mins Stove Top', 'recipe_ingredient_parts': "['olive oil' 'ground beef' 'onions' 'chili powder' 'pumpkin pie spice'\n 'diced tomatoes' 'chicken broth' 'raisins' 'green olives' 'cider vinegar']", 'recipe_instructions': 'heat oil in skillet, add beef and onion, cook until browned, breaking up big chunks, drain off fat. add chili powder and pumpkin pie spice, stir well. add tomatoes, broth, raisins and olives, and vinegar. simmer for 15 minutes until thickened and flavors are blended. season to taste with salt& pepper, garnish with almonds.', 'aggregated_rating': '4.5', 'review_count': '5.0'}, page_content='picadillo Meat Make and share this picadillo recipe from Food.com. heat oil in skillet, add beef and onion, cook until browned, breaking up big chunks, drain off fat. add chili powder and pumpkin pie spice, sti

In [21]:
documents[0].metadata

{'name': 'picadillo',
 'description': 'Make and share this picadillo recipe from Food.com.',
 'recipe_category': 'Meat',
 'keywords': 'Low Cholesterol Kosher < 60 Mins Stove Top',
 'recipe_ingredient_parts': "['olive oil' 'ground beef' 'onions' 'chili powder' 'pumpkin pie spice'\n 'diced tomatoes' 'chicken broth' 'raisins' 'green olives' 'cider vinegar']",
 'recipe_instructions': 'heat oil in skillet, add beef and onion, cook until browned, breaking up big chunks, drain off fat. add chili powder and pumpkin pie spice, stir well. add tomatoes, broth, raisins and olives, and vinegar. simmer for 15 minutes until thickened and flavors are blended. season to taste with salt& pepper, garnish with almonds.',
 'aggregated_rating': '4.5',
 'review_count': '5.0'}

In [22]:
documents[0].page_content

'picadillo Meat Make and share this picadillo recipe from Food.com. heat oil in skillet, add beef and onion, cook until browned, breaking up big chunks, drain off fat. add chili powder and pumpkin pie spice, stir well. add tomatoes, broth, raisins and olives, and vinegar. simmer for 15 minutes until thickened and flavors are blended. season to taste with salt& pepper, garnish with almonds. Low Cholesterol Kosher < 60 Mins Stove Top'

In [23]:
len(documents)

100

In [24]:
from langchain_community.vectorstores import Qdrant

# Build a Qdrant vector store from the documents using embedding_model.
# location=":memory:" requests an in-memory store, so presumably nothing is persisted
# and the index has to be rebuilt after a kernel restart.
qdrant_store = Qdrant.from_documents(documents,
    embedding_model,
    location=":memory:",
)

In [25]:
qdrant_retriever = qdrant_store.as_retriever()

In [26]:
# Converts a list of document objects into a string with its metadata
def format_docs(docs):
    """Render a list of documents as one context string, metadata included.

    Each document becomes its ``page_content`` followed by a ``Metadata:``
    section with one ``key: value`` line per metadata entry. Documents are
    separated by a ``---`` divider.

    The ``name``, ``recipe_category`` and ``description`` keys are left out,
    as are values that are only "No ... Available" placeholders, so the model
    is not shown filler text as if it were data.

    Args:
        docs: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).

    Returns:
        The formatted string; an empty string when ``docs`` is empty.
    """
    excluded_columns = ("name", "recipe_category", "description")
    # Placeholder strings written by create_documents for empty fields, plus the
    # legacy "No Data Available" sentinel that was the only value checked before.
    # Keep this in sync with create_documents.
    placeholder_values = (
        "No Data Available",
        "No Name Available",
        "No Description Available",
        "No Category Available",
        "No Keywords Available",
        "No Recipe Ingredient Parts Available",
        "No Recipe Instructions Available",
        "No Rating Available",
        "No Reviews Available",
    )

    formatted_docs = []
    for doc in docs:
        metadata_lines = [
            f"{key}: {value}"
            for key, value in doc.metadata.items()
            if key not in excluded_columns
            and not (isinstance(value, str) and value in placeholder_values)
        ]
        metadata_content = "\n".join(metadata_lines)
        formatted_docs.append(f"{doc.page_content}\n\nMetadata:\n{metadata_content}")

    return "\n\n---\n\n".join(formatted_docs)


### Agent function definitions

In [27]:
# Tool definition for the recipe vector-database lookup. It is listed in the "tools"
# field of the request built in query_bedrock_llm, and handle_function_calls matches
# on its "name" to execute the call. The single required input is "queries", a list of strings.
recipe_db_query_tool = {
  "name": "query_food_recipe_vector_db",
  "description": """
  Queries the vector database containing food recipes to retrieve the most relevant documents. 
  This function allows the model to generate and execute multiple queries as necessary to gather comprehensive context,
  ensuring accurate and thorough responses to user queries specifically related to recipes.
  Each recipe document includes details such as name, description, recipe category, ingredients, instructions, cook time, 
  community rating, and review count
  """,
  "input_schema": {
    "type": "object",
    "properties": {
      "queries": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "A query generated by the model to run against the vector database to fetch recipe documents. This should be used to fetch specific recipes, ingredients, preparation steps, and related metadata."
        },
        "description": "A list of queries generated by the model to run against the vector database to fetch recipe documents."
      }
    },
    "required": ["queries"]
  }
}


In [28]:
# Tool definition for the Google web search. It is listed in the "tools" field of the
# request built in query_bedrock_llm, and handle_function_calls matches on its "name"
# to execute the call. The single required input is "queries", a list of strings.
google_web_search_tool = {
  "name": "google_web_search",
  "description": """
  Queries the Google web search engine to retrieve relevant information from the web. 
  This function allows the model to search for information it is unsure about, such as 
  ingredient substitutions, cooking techniques, nutritional information, or other specific
  knowledge not contained within the recipe vector database, providing accurate and up-to-date 
  answers to user queries.
  """,
  "input_schema": {
    "type": "object",
    "properties": {
      "queries": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "A query generated by the model to search on Google for relevant information, particularly for details outside the scope of the recipe vector database, such as ingredient substitutions or cooking techniques."
        },
        "description": "A list of queries generated by the model to search on Google for relevant information."
      }
    },
    "required": ["queries"]
  }
}


### Init Bedrock client and define a util for stateless messaging (no fn calling)

In [60]:
bedrock_client = boto3.client('bedrock-runtime', region_name="us-east-1")

In [84]:
baseline_sys_prompt = """
You are a helpful assistant and expert in cooking recipes.

Before answering, follow these requirements:

- Always make at least one call to query_food_recipe_vector_db to retrieve the relevant context of recipes and ingredients to generate an informed and high-quality response to the user prompt, specifically for retrieving recipes, ingredients, preparation steps, and related metadata.

- NEVER exceed a MAXIMUM of 3 calls to the query_food_recipe_vector_db function.

- If you encounter a query related to ingredient substitutions, preparation techniques, nutritional information, or other specific knowledge not contained within the recipe database, make a call to the google_web_search function to look up relevant information.

- Do not use the google_web_search function to look up entire recipes. It should only be used for supplementary information not found in the recipe database.

- Analyze the user's requirements and NEVER provide a recipe that violates ANY of the user's requirements.

- In your final response, NEVER include any XML tags with information about your thoughts. It is okay to include XML and analysis text in any message except your final one with the recipes and instructions.

Provide a response to the user prompt about food with recommended recipes and instructions.
"""

In [85]:
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"

def query_bedrock_llm(messages):
    """Send a conversation to the Bedrock model and return the decoded response.

    The call is stateless: the full message list must be supplied each time.
    The system prompt and both tool definitions are attached to every request.

    Args:
        messages: Chat-style message list, sent as the request's 'messages' field.

    Returns:
        The response body parsed from JSON.
    """
    request_payload = {
        # Required in order to use the chat-style messages object
        'anthropic_version': 'bedrock-2023-05-31',
        'system': baseline_sys_prompt,
        'messages': messages,
        'max_tokens': 3000,
        "tools": [recipe_db_query_tool, google_web_search_tool],
        # A "tool_choice" entry such as
        #   {"type": "tool", "name": recipe_db_query_tool['name']}
        # would force the model to call the recipe db query tool; it is left out here.
        # https://docs.anthropic.com/en/docs/build-with-claude/tool-use#controlling-claudes-output
        # TODO: TUNE THESE VALUES
        'temperature': 0.1,
        'top_p': 0.9,
    }

    raw_response = bedrock_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(request_payload),
    )

    return json.loads(raw_response.get('body').read())

### Implement continuous dialogue and function calling

In [86]:
def generate_message(prompt):
    """Wrap a text prompt in a chat-style user message.

    Args:
        prompt: The user's text. Must be a ``str`` (subclasses are accepted).

    Returns:
        A dict with role "user" and a single text content block holding ``prompt``.

    Raises:
        ValueError: If ``prompt`` is not a string.
    """
    # isinstance is the idiomatic type check and also accepts str subclasses.
    if not isinstance(prompt, str):
        raise ValueError(f'Tried to call message generate_message with non-string input: {prompt}')

    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt
            }
        ]
    }

In [87]:
def generate_tool_message(fn_results):
    """Wrap tool results in a user-role message.

    Args:
        fn_results: The tool result objects; used as the message content as-is.

    Returns:
        A dict with role "user" whose content is ``fn_results`` (not copied).
    """
    return dict(role="user", content=fn_results)

In [88]:
import json

# Adds the current prompt as a new message to the chat history
# and calls bedrock with the entire chat history
# Returns the response body, llm's message, and new chat history

'''
Example response body structure:
{
   "id":"msg_bdrk_01C5GGkafK7aL3P5i3rsMr1p",
   "type":"message",
   "role":"assistant",
   "model":"claude-3-sonnet-20240229",
   "content":[
      {
         "type":"tool_use",
         "id":"toolu_bdrk_01CQiYa8BMJfpJC68DuRdwQn",
         "name":"query_food_recipe_vector_db",
         "input":{
            "queries":[
               "healthy fish dinner recipe under 500 calories",
               "fish dinner recipe for two under 40 minutes"
            ]
         }
      }
   ],
   "stop_reason":"tool_use",
   "stop_sequence":"None",
   "usage":{
      "input_tokens":559,
      "output_tokens":55
   }
}
'''
def message_handler(existing_chat_history, prompt, is_tool_message=False):
    """Append a new user turn to the history, query the model, and record its reply.

    Args:
        existing_chat_history: List of prior messages. It is mutated in place:
            the new user message and the model's reply are appended to it.
        prompt: A text prompt, or (when ``is_tool_message`` is True) the array
            of tool response objects to send back to the model.
        is_tool_message: Selects the tool-result message shape instead of the
            plain text one.

    Returns:
        A three-item list: the full response body, the model's message
        (role and content only), and the updated chat history.
    """
    # Tool results and plain prompts need different message structures.
    build_message = generate_tool_message if is_tool_message else generate_message
    existing_chat_history.append(build_message(prompt))

    # The model always receives the entire conversation so far.
    response_body = query_bedrock_llm(existing_chat_history)
    llm_message = {field: response_body[field] for field in ('role', 'content')}

    # Record the reply so the next turn includes it.
    existing_chat_history.append(llm_message)

    return [response_body, llm_message, existing_chat_history]

In [89]:
# Executes a list of queries and returns a list of document results
def handle_vector_db_queries(queries, retriever=None):
    """Run each query against a retriever and collect all returned documents.

    Args:
        queries: Iterable of query strings.
        retriever: Object with an ``invoke(query)`` method returning a list of
            documents. When omitted, the notebook-level ``qdrant_retriever`` is
            looked up at call time, so a retriever rebuilt in an earlier cell is
            picked up without re-running this definition.

    Returns:
        A flat list with the results of every query, in query order. Documents
        returned by more than one query appear more than once.
    """
    if retriever is None:
        retriever = qdrant_retriever

    return [doc for query in queries for doc in retriever.invoke(query)]

In [90]:
import requests

"""
Executes a list of queries using the Google Custom Search API and returns a list of search results.

Returns:
A dictionary where each query maps to a list of search results. Each search result contains the title, and snippet.
"""
def handle_google_web_search(queries, api_key=None, cse_id='b20454e29b3b14095', num_results=10, timeout=10):
    """Run each query through the Google Custom Search API.

    Args:
        queries: Iterable of search query strings.
        api_key: API key sent as the ``key`` parameter. When omitted it is
            fetched with ``get_secret('google_search_api_key')`` at call time;
            previously the default was evaluated once when the function was
            defined, which triggered a Secrets Manager request on definition.
        cse_id: Custom search engine id, sent as the ``cx`` parameter.
        num_results: Number of results requested per query (``num`` parameter).
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled request cannot hang the chat loop indefinitely.

    Returns:
        A dictionary mapping each query to a list of results; each result is a
        dict with the ``title`` and ``snippet`` of one search hit.

    Raises:
        requests.HTTPError: If any request returns an unsuccessful status.
    """
    if api_key is None:
        api_key = get_secret('google_search_api_key')

    # The base URL for the Google Custom Search API
    url = 'https://www.googleapis.com/customsearch/v1'

    all_search_results = {}

    for query in queries:
        params = {
            'key': api_key,
            'cx': cse_id,
            'q': query,
            'num': num_results
        }
        response = requests.get(url, params=params, timeout=timeout)

        # Raise an exception if the request was unsuccessful
        response.raise_for_status()

        # A response without hits has no 'items' key; treat that as no results.
        items = response.json().get('items', [])

        # Keep only the title and snippet of each hit.
        all_search_results[query] = [
            {'title': item.get('title'), 'snippet': item.get('snippet')}
            for item in items
        ]

    return all_search_results


In [91]:
# handle_function_calls (below) takes the LLM message content as its argument and returns a list of the fn result objects
google_search_api_key = get_secret('google_search_api_key')

def handle_function_calls(tool_call_message_content):
    """Execute the tool calls requested in a model message.

    Content blocks whose type is not ``tool_use`` are ignored. For every
    ``tool_use`` block exactly one ``tool_result`` object is produced, tied to
    the call through ``tool_use_id``.

    Args:
        tool_call_message_content: The ``content`` list of a model message.

    Returns:
        A list of tool result dicts. Successful calls carry the tool output in
        ``content``: formatted recipe documents for the vector db tool, and a
        JSON string of search results for the web search tool. Calls to an
        unknown function, or calls missing the ``queries`` argument, are
        returned with ``is_error`` set to True and a short explanation in
        ``content`` so the model can see what went wrong.
    """
    # Handlers are small closures so each tool's dependencies are only touched
    # when that tool is actually invoked.
    def _query_recipe_db(queries):
        return format_docs(handle_vector_db_queries(queries))

    def _search_web(queries):
        return json.dumps(handle_google_web_search(queries, google_search_api_key))

    tool_handlers = {
        'query_food_recipe_vector_db': _query_recipe_db,
        'google_web_search': _search_web,
    }

    tool_results = []

    for tool_call in tool_call_message_content:
        # Only process messages from the LLM that are function calls
        if tool_call['type'] != 'tool_use':
            continue
        fn_name = tool_call['name']
        fn_args = tool_call['input']
        fn_result = {
            "type": "tool_result",
            "tool_use_id": tool_call['id'],
        }

        handler = tool_handlers.get(fn_name)
        if handler is None:
            print(f"ERROR: Attempted call to unknown function {fn_name}")
            fn_result['content'] = f"Unknown function '{fn_name}'."
            fn_result['is_error'] = True
        elif 'queries' not in fn_args:
            print(f"ERROR: Tried to call {fn_name} with invalid args {fn_args}, skipping..")
            fn_result['content'] = f"Invalid arguments for {fn_name}: expected a 'queries' list."
            fn_result['is_error'] = True
        else:
            print(f"Model called {fn_name} with args {fn_args}")
            fn_result['content'] = handler(fn_args['queries'])

        tool_results.append(fn_result)

    return tool_results

### Run dynamic query function calling with test queries

In [92]:
'''
Example payload structure of response_body:

{'id': 'msg_bdrk_01REesjegNiLteurBoxW7pSt',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-3-sonnet-20240229',
 'content': [{'type': 'tool_use',
   'id': 'toolu_bdrk_01191W2FuAFTRoDqKKeJSmmn',
   'name': 'query_food_recipe_vector_db',
   'input': {'queries': ['thai food',
     'peanut free',
     'low carb',
     'not spicy']}}],
 'stop_reason': 'tool_use',
 'stop_sequence': None,
 'usage': {'input_tokens': 573, 'output_tokens': 52}}


Example payload structure of llm_message['content']:

[{'type': 'tool_use',
   'id': 'toolu_bdrk_01191W2FuAFTRoDqKKeJSmmn',
   'name': 'query_food_recipe_vector_db',
   'input': {'queries': ['thai food',
     'peanut free',
     'low carb',
     'not spicy']}}]
'''

# This function is the entry point to invoke the LLM with support for function calling
# parsing output, calling requested functions, sending output is handled here
def run_chat_loop(prompt, max_tool_rounds=10):
    """Run one user prompt through the model, servicing tool calls until it answers.

    Starts a fresh chat history, sends the prompt, and while the model stops
    with ``tool_use`` executes the requested tools and sends the results back.
    The user prompt and the final model text are printed; nothing is returned.

    Args:
        prompt: The user's question as a string.
        max_tool_rounds: Upper bound on tool-call round trips. The loop stops
            with a warning once it is reached, so a model that keeps requesting
            tools cannot loop (and call the API) without end.
    """
    print(f"[User]: {prompt}")
    response_body, llm_message, chat_history = message_handler(existing_chat_history=[], prompt=prompt)

    # The model wants to call tools, call them, provide response, repeat until content is generated
    tool_rounds = 0
    while response_body['stop_reason'] == 'tool_use':
        if tool_rounds >= max_tool_rounds:
            print(f"WARNING: Stopping after {max_tool_rounds} tool rounds without a final answer")
            break
        tool_rounds += 1

        fn_results = handle_function_calls(tool_call_message_content=llm_message['content'])

        # Send function results back to LLM as a new message with the existing chat history
        response_body, llm_message, chat_history = message_handler(
            existing_chat_history=chat_history,
            prompt=fn_results,
            is_tool_message=True
        )

    # The reply may hold zero, one or several text blocks (and non-text blocks);
    # print every text block instead of assuming the first block is text.
    text_blocks = [
        block['text']
        for block in llm_message['content']
        if block.get('type') == 'text'
    ]
    final_text = "\n".join(text_blocks) if text_blocks else "(no text content returned)"
    print(f"\n[Model]: {final_text}")


### Run example queries that should trigger both tool calls

In [94]:
run_chat_loop("Recipe for beef stew, and what wine pairs well with it?")

[User]: Recipe for beef stew, and what wine pairs well with it?
Model called query_food_recipe_vector_db with args {'queries': ['beef stew recipe']}
Model called google_web_search with args {'queries': ['what wine goes well with beef stew']}

[Model]: <search_quality_reflection>
The web search results provide helpful recommendations for wines that pair well with beef stew, including full-bodied reds like Cabernet Sauvignon, Merlot, Malbec, and Bordeaux blends. I now have enough information to provide a complete answer to the original query.
</search_quality_reflection>

<search_quality_score>5</search_quality_score>

<result>
Here is a recipe for classic beef stew, along with wine pairing recommendations:

Beef Stew Recipe:

Ingredients:
- 2 lbs beef chuck roast or stew meat, cut into 1-inch cubes 
- 3 tbsp olive oil
- 1 onion, diced 
- 3 carrots, sliced
- 3 celery stalks, sliced
- 3 cloves garlic, minced
- 3 tbsp tomato paste
- 1 cup red wine (like Cabernet Sauvignon or Merlot)
- 3 cu

In [96]:
run_chat_loop("How can I make a sugar-free chocolate cake, and what are good substitutes for sugar")

[User]: How can I make a sugar-free chocolate cake, and what are good substitutes for sugar
Model called query_food_recipe_vector_db with args {'queries': ['sugar-free chocolate cake', 'sugar substitutes for baking']}
Model called google_web_search with args {'queries': ['sugar substitutes for baking cakes and desserts']}

[Model]: Based on the recipe database results and web search information, here are some good tips for making a sugar-free chocolate cake and substituting sugar in baking:

- Use fruit purees like mashed bananas, applesauce, or prune puree to replace some or all of the sugar. The natural sweetness and moisture from the fruit can substitute for sugar.

- Try artificial or natural zero-calorie sweeteners like stevia, monk fruit sweetener, or erythritol. These provide sweetness without the calories of sugar. Start with less than the recipe calls for sugar as they can have a stronger sweetness.

- Use a combination of fruit purees and zero-calorie sweeteners to get the ri

In [97]:
run_chat_loop("Can you find a recipe for apple pie, and what can I use instead of butter to make it vegan?")

[User]: Can you find a recipe for apple pie, and what can I use instead of butter to make it vegan?
Model called query_food_recipe_vector_db with args {'queries': ['apple pie recipe']}
Model called google_web_search with args {'queries': ['vegan butter substitute for baking apple pie']}

[Model]: <search_quality_reflection>
The search results provide helpful information on using vegan butter substitutes like Earth Balance or coconut oil when baking vegan apple pies. I now have enough context to provide a full answer to the original query.
</search_quality_reflection>

<search_quality_score>5</search_quality_score>

<result>
Here is a recipe for vegan apple pie using vegan butter substitute:

Vegan Apple Pie

Ingredients:
- 2 pie crusts (use a vegan pie crust recipe with vegan butter substitute like Earth Balance or coconut oil instead of regular butter)
- 6-8 apples, peeled, cored and sliced 
- 3/4 cup sugar
- 2 tablespoons all-purpose flour
- 1 teaspoon ground cinnamon
- 1/4 teaspoon 