### Ingestion

In [1]:
# Import the datafile
import requests
data_url = 'https://raw.githubusercontent.com/eadka/fridgechef/main/Data/RecipeData.json'
data_response = requests.get(data_url)
recipes_data = data_response.json()

In [2]:
# Ensuring all the data has strings because minsearch, under the hood uses TfidfVectorizer and expects each text_field to be a string
for recipe in recipes_data:
    for field in ["dish_name",  "cuisine",  "diet", "tags",  "main_ingredients", 
                 "cooking_time_minutes", "difficulty",  "ingredients_full", 
                 "instructions", "substitutions", "flavor_notes"]:
        value = recipe.get(field, "")
        if isinstance(value,list):
            recipe[field] = " ".join(map(str,value)) # join the list into string
        elif not isinstance(value, str):
            recipe[field] = str(value) # convert numbers to string

In [3]:
# Search engine and indexing
import minsearch

# Indexing the document
index = minsearch.Index(
    text_fields=["dish_name",  "cuisine",  "diet", "tags",  "main_ingredients", 
                 "cooking_time_minutes", "difficulty",  "ingredients_full", 
                 "instructions", "substitutions", "flavor_notes"],
    keyword_fields=[]
)

In [4]:
index.fit(recipes_data)

<minsearch.minsearch.Index at 0x75fba6361df0>

In [5]:
query = 'Give me recipes for carrots and beans'

In [6]:
index.search(query,num_results=2)

[{'dish_name': 'Rajma Masala',
  'cuisine': 'Indian',
  'diet': 'Vegan',
  'tags': 'protein-rich curry comfort food',
  'main_ingredients': 'kidney beans onion tomato ginger garam masala',
  'cooking_time_minutes': '45',
  'difficulty': 'Medium',
  'ingredients_full': "{'item': 'kidney beans', 'quantity': '2 cups cooked'} {'item': 'onion', 'quantity': '1 large'} {'item': 'tomato', 'quantity': '2'} {'item': 'ginger', 'quantity': '1 inch'} {'item': 'garam masala', 'quantity': '1 tsp'}",
  'instructions': 'Sauté onion and ginger until golden. Add tomato and spices, cook until soft. Add kidney beans, simmer for 20 minutes.',
  'substitutions': "{'kidney beans': ['black beans']}",
  'flavor_notes': 'Rich, spiced, and hearty.'},
 {'dish_name': 'Minestrone Soup',
  'cuisine': 'Italian',
  'diet': 'Vegan',
  'tags': 'soup hearty vegetable-rich',
  'main_ingredients': 'carrot celery zucchini beans pasta tomato',
  'cooking_time_minutes': '40',
  'difficulty': 'Easy',
  'ingredients_full': "{'it

### RAG Flow

In [7]:
# Open AI for LLM integration
from openai import OpenAI

client = OpenAI()

In [8]:
# response = client.chat.completions.create(
#     model='gpt-4o-mini',
#     messages=[{"role": "user", "content": query}]
# )

# response.choices[0].message.content

In [9]:
# Defining the RAG flow
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [10]:
recipes_data[0]

{'dish_name': 'Vegetable Pad Thai',
 'cuisine': 'Thai',
 'diet': 'Vegan',
 'tags': 'quick noodles stir-fry',
 'main_ingredients': 'rice noodles tofu carrot bean sprouts spring onions peanuts soy sauce lime garlic',
 'cooking_time_minutes': '25',
 'difficulty': 'Easy',
 'ingredients_full': "{'item': 'rice noodles', 'quantity': '200g'} {'item': 'tofu', 'quantity': '150g'} {'item': 'carrot', 'quantity': '1 medium'} {'item': 'bean sprouts', 'quantity': '1 cup'} {'item': 'spring onions', 'quantity': '2'} {'item': 'peanuts', 'quantity': '2 tbsp, crushed'} {'item': 'soy sauce', 'quantity': '3 tbsp'} {'item': 'lime', 'quantity': '1'} {'item': 'garlic', 'quantity': '2 cloves'}",
 'instructions': 'Soak rice noodles in warm water for 20 minutes. Stir-fry garlic and tofu until golden. Add vegetables and stir-fry for 2-3 minutes. Add noodles and sauce, toss until combined. Garnish with peanuts and lime.',
 'substitutions': "{'tofu': ['tempeh', 'chickpeas'], 'soy sauce': ['tamari', 'coconut aminos']

In [11]:
# prompt_template = """
# You're a "Fridge Chef", a helpful cooking assistant. 
# The user will give you a list of vegetables or ingredients they have available.
# Base your answer only on the recipes in the CONTEXT.
# If you cannot find an exact match, suggest the closest dishes using the available ingredients.

# When answering:
# - Include the dish name, cuisine, diet type, main ingredients, and cooking time.
# - Provide short cooking instructions based on the CONTEXT.
# - Suggest possible ingredient substitutions if given in the CONTEXT.
# - If multiple dishes fit, return the top 3–5 most relevant recipes.

# QUESTION: {question}

# CONTEXT: 
# {context}
# """.strip()

# entry_template = """
# dish_name: {dish_name}
# cuisine: {cuisine}
# diet: {diet}
# tags: {tags}
# main_ingredients: {main_ingredients}
# cooking_time_minutes: {cooking_time_minutes}
# difficulty: {difficulty}
# ingredients_full: {ingredients_full}
# instructions: {instructions}
# substitutions: {substitutions}
# flavor_notes: {flavor_notes}
# """.strip()

# def build_prompt(query, search_results):
#     context = ""
    
#     for doc in search_results:
#         context = context + entry_template.format(**doc) + "\n\n"
    
#     prompt = prompt_template.format(question=query, context=context).strip()
#     return prompt

In [12]:
prompt_template = """
You're a "Fridge Chef", a helpful cooking assistant. 
The user will give you a list of vegetables or ingredients they have available.
Base your answer only on the recipes in the CONTEXT.
If you cannot find an exact match, suggest the closest dishes using the available ingredients.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

entry_template = """
dish_name: {dish_name}
cuisine: {cuisine}
diet: {diet}
tags: {tags}
main_ingredients: {main_ingredients}
cooking_time_minutes: {cooking_time_minutes}
difficulty: {difficulty}
ingredients_full: {ingredients_full}
instructions: {instructions}
substitutions: {substitutions}
flavor_notes: {flavor_notes}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [13]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [14]:
print(prompt)

You're a "Fridge Chef", a helpful cooking assistant. 
The user will give you a list of vegetables or ingredients they have available.
Base your answer only on the recipes in the CONTEXT.
If you cannot find an exact match, suggest the closest dishes using the available ingredients.

QUESTION: Give me recipes for carrots and beans

CONTEXT: 
dish_name: Rajma Masala
cuisine: Indian
diet: Vegan
tags: protein-rich curry comfort food
main_ingredients: kidney beans onion tomato ginger garam masala
cooking_time_minutes: 45
difficulty: Medium
ingredients_full: {'item': 'kidney beans', 'quantity': '2 cups cooked'} {'item': 'onion', 'quantity': '1 large'} {'item': 'tomato', 'quantity': '2'} {'item': 'ginger', 'quantity': '1 inch'} {'item': 'garam masala', 'quantity': '1 tsp'}
instructions: Sauté onion and ginger until golden. Add tomato and spices, cook until soft. Add kidney beans, simmer for 20 minutes.
substitutions: {'kidney beans': ['black beans']}
flavor_notes: Rich, spiced, and hearty.

di

In [15]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [16]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer
rag(query)

'Based on the ingredients you provided, here are two recipes that feature carrots and beans:\n\n1. **Minestrone Soup** (Italian)\n   - **Cooking Time:** 40 minutes\n   - **Difficulty:** Easy\n   - **Ingredients:**\n     - 2 carrots\n     - 2 stalks celery\n     - 1 zucchini\n     - 1 cup cooked cannellini beans (you can substitute with kidney beans)\n     - 2 tomatoes\n     - 4 cups vegetable broth\n     - 1/2 cup small pasta\n   - **Instructions:**\n     1. Sauté carrot and celery for 5 minutes.\n     2. Add zucchini, tomatoes, beans, and vegetable broth. Simmer for 20 minutes.\n     3. Add pasta and cook until tender.\n\n   - **Flavor Notes:** Comforting, savory with a light herbal aroma.\n\n2. **Kung Pao Vegetables** (Chinese)\n   - **Cooking Time:** 25 minutes\n   - **Difficulty:** Medium\n   - **Ingredients:**\n     - 1 cup chopped carrots\n     - 1 cup chopped bell peppers\n     - 2 tbsp soy sauce\n     - 1/4 cup peanuts (optional)\n     - 2 cloves minced garlic\n   - **Instructi

In [17]:
answer = rag('What is the main cooking technique used in the Vegetable Pad Thai?')
print(answer)

The main cooking technique used in the Vegetable Pad Thai is stir-frying.


### Retrieval Evaluation

In [18]:
import pandas as pd

In [20]:
df_question = pd.read_csv('../Data/ground-truth-retrieval.csv')

In [37]:
df_question.head()

Unnamed: 0,id,question
0,Vegetable Pad Thai,What are the main ingredients used in the Vege...
1,Vegetable Pad Thai,How long does it take to cook the Vegetable Pa...
2,Vegetable Pad Thai,What can I use instead of tofu in the Vegetabl...
3,Vegetable Pad Thai,What type of cuisine does the Vegetable Pad Th...
4,Vegetable Pad Thai,Can you describe the flavor profile of the Veg...


In [36]:
df_question.describe()

Unnamed: 0,id,question
count,490,490
unique,98,486
top,Vegetable Pad Thai,Can I use tofu instead of paneer in this recipe?
freq,5,2


In [22]:
ground_truth = df_question.to_dict(orient='records')

In [23]:
ground_truth[0]

{'id': 'Vegetable Pad Thai',
 'question': 'What are the main ingredients used in the Vegetable Pad Thai recipe?'}

In [24]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [25]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [32]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['dish_name'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [33]:
from tqdm.auto import tqdm

In [34]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/490 [00:00<?, ?it/s]

{'hit_rate': 0.9755102040816327, 'mrr': 0.836089245221898}

### Finding the best parameters

In [35]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [38]:
# Breaking the data into validation and test data sets
df_validation = df_question[:100]
df_test = df_question[100:]

In [39]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Using float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [40]:
gt_val = df_validation.to_dict(orient='records')

In [41]:
def minsearch_search(query,boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [48]:
# mrr
param_ranges = {
    'dish_name': (0.0,3.0),
    'cuisine': (0.0,3.0),
    'diet': (0.0,3.0),
    'tags': (0.0,3.0),
    'main_ingredients': (0.0,3.0),
    'cooking_time_minutes': (0.0,3.0),
    'difficulty': (0.0,3.0),
    'ingredients_full': (0.0,3.0)
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [49]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'dish_name': 2.503759848167998,
  'cuisine': 2.3917649388518845,
  'diet': 0.6117659155729684,
  'tags': 0.13785745660655768,
  'main_ingredients': 1.155933114892237,
  'cooking_time_minutes': 1.3302286237468963,
  'difficulty': 2.5913095470354444,
  'ingredients_full': 2.570509594277529},
 0.9348611111111111)

In [44]:
# mrr
def minsearch_improved(query):
    boost = {'dish_name': 2.49,
        'cuisine': 2.16,
        'diet': 2.745,
        'tags': 0.23,
        'main_ingredients': 1.631,
        'cooking_time_minutes': 0.39,
        'difficulty': 2.64,
        'ingredients_full': 1.73
        } 

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/490 [00:00<?, ?it/s]

{'hit_rate': 0.9734693877551021, 'mrr': 0.9050413022351798}