### Ingestion

In [1]:
# Download the recipe dataset (JSON) from the project repository.
import requests
data_url = 'https://raw.githubusercontent.com/eadka/fridgechef/main/Data/RecipeData.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data_response = requests.get(data_url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to JSON-decode an error page.
data_response.raise_for_status()
recipes_data = data_response.json()

In [2]:
# Ensure every listed field is present and is a string: minsearch uses TfidfVectorizer
# under the hood and expects each text field to be a string, and the prompt template
# formatted later in this notebook needs every one of these keys to exist.
for recipe in recipes_data:
    for field in ["dish_name",  "cuisine",  "diet", "tags",  "main_ingredients", 
                 "cooking_time_minutes", "difficulty",  "ingredients_full", 
                 "instructions", "substitutions", "flavor_notes"]:
        value = recipe.get(field)
        if value is None:
            # Missing or null field: store empty text so the key always exists
            # (avoids a KeyError in str.format and the literal string "None").
            value = ""
        elif isinstance(value, list):
            value = " ".join(map(str, value))  # join the list items into one string
        elif not isinstance(value, str):
            value = str(value)  # convert numbers (and other scalars) to string
        # Always write back, so the record is normalised even when the key was absent.
        recipe[field] = value

In [3]:
# OpenAI client used by the LLM calls further down in this notebook.
from openai import OpenAI

# NOTE(review): no API key is passed explicitly, so the SDK presumably reads it from the
# environment (OPENAI_API_KEY) — confirm it is set before running this cell.
client = OpenAI()

In [4]:
# Prompt used to generate ground-truth questions for one recipe record.
# Single-brace placeholders are filled by str.format(**recipe); the doubled braces
# around the JSON example are escapes that render as literal { and }.
prompt_template = """
Emulate you are the user of the fridge chef application.
Formulate 5 questions this user might ask based on provided recipes.
Make the questions specific to these recipes.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:
dish_name: {dish_name}
cuisine: {cuisine}
diet: {diet}
tags: {tags}
main_ingredients: {main_ingredients}
cooking_time_minutes: {cooking_time_minutes}
difficulty: {difficulty}
ingredients_full: {ingredients_full}
instructions: {instructions}
substitutions: {substitutions}
flavor_notes: {flavor_notes}

Provide the output in parsable JSON format without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [5]:
# Render the template with the first recipe to inspect the final prompt text
# before sending anything to the model.
prompt = prompt_template.format(**recipes_data[0])
print(prompt)

Emulate you are the user of the fridge chef appplication.
Formulate 5 questions this user might ask based on provided recipes.
Make the questions specific to these recipes.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:
dish_name: Vegetable Pad Thai
cuisine: Thai
diet: Vegan
tags: quick noodles stir-fry
main_ingredients: rice noodles tofu carrot bean sprouts spring onions peanuts soy sauce lime garlic
cooking_time_minutes: 25
difficulty: Easy
ingredients_full: {'item': 'rice noodles', 'quantity': '200g'} {'item': 'tofu', 'quantity': '150g'} {'item': 'carrot', 'quantity': '1 medium'} {'item': 'bean sprouts', 'quantity': '1 cup'} {'item': 'spring onions', 'quantity': '2'} {'item': 'peanuts', 'quantity': '2 tbsp, crushed'} {'item': 'soy sauce', 'quantity': '3 tbsp'} {'item': 'lime', 'quantity': '1'} {'item': 'garlic', 'quantity': '2 cloves'}
instructions: Soak rice no

In [6]:
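# NOTE: scratch version of the API call, kept for reference only. `query` is not
# defined anywhere above; the working version is the `llm()` helper in the next cell.
# response = client.chat.completions.create(
#     model='gpt-4o-mini',
#     messages=[{"role": "user", "content": query}]
# )

# response.choices[0].message.content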

In [7]:
def llm(prompt):
    """Send ``prompt`` to gpt-4o-mini as a single user message and return the reply text.

    Uses the notebook-level ``client``; the returned value is the ``content`` of the
    first choice in the chat completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
# Smoke test: send the prompt rendered above for the first recipe and keep the raw reply.
questions = llm(prompt)

In [9]:
import json

In [10]:
# Check that the raw reply parses as JSON; the prompt asks for {"questions": [...]}.
json.loads(questions)

{'questions': ['How long should I soak the rice noodles before cooking?',
  'What ingredients do I need to make the Vegetable Pad Thai?',
  'Can I use chickpeas instead of tofu in this recipe?',
  'What is the cooking time for this dish?',
  'What flavor notes can I expect from the Vegetable Pad Thai?']}

In [11]:
def generate_questions(doc):
    """Ask the model for user-style questions about a single recipe.

    Args:
        doc: recipe record (mapping). It must provide every field referenced by
            ``prompt_template``; ``str.format`` raises ``KeyError`` otherwise.

    Returns:
        The model's raw reply as a string. The prompt requests a JSON object of the
        form ``{"questions": [...]}``; parsing is left to the caller.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: have the API return a syntactically valid JSON object rather than
        # relying only on the prompt's "no code blocks" instruction, since the caller
        # feeds this reply straight into json.loads.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [12]:
from tqdm.auto import tqdm

In [13]:
# Maps doc id (dish name) -> list of generated questions. Because this lives in its own
# cell, re-running the generation loop keeps what was already collected: the loop skips
# ids that are already present here.
results = {}

In [14]:
# Generate questions for every recipe, keyed by dish name. Recipes whose id is already in
# `results` are skipped, so this cell can be re-run to resume after an interruption
# (this also means recipes sharing a dish_name are only generated once).
failed_ids = []  # ids whose model reply could not be parsed on this pass

for doc in tqdm(recipes_data): 
    doc_id = doc['dish_name']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # One malformed reply (invalid JSON, missing "questions" key, empty content)
        # must not abort the whole run and waste the API calls already made; leave
        # the id out of `results` so a re-run retries it.
        failed_ids.append(doc_id)
        print(f"Skipping {doc_id!r}: could not parse model reply ({err!r})")

if failed_ids:
    print(f"{len(failed_ids)} recipe(s) failed; re-run this cell to retry them: {failed_ids}")

  0%|          | 0/99 [00:00<?, ?it/s]

In [15]:
# Flatten {doc_id: [question, ...]} into a list of (doc_id, question) pairs,
# preserving the dict's order and each recipe's question order.
final_results = [
    (recipe_id, question)
    for recipe_id, recipe_questions in results.items()
    for question in recipe_questions
]

In [16]:
# Peek at the first (doc id, question) pair.
final_results[0]

('Vegetable Pad Thai',
 'What are the main ingredients used in the Vegetable Pad Thai recipe?')

In [17]:
import pandas as pd

# One row per (doc id, question) pair; 'id' holds the dish name used as the doc id above.
df_results = pd.DataFrame(final_results, columns = ['id','question'])

In [18]:
# Persist the ground-truth set; index=False leaves the DataFrame's row index out of the CSV.
df_results.to_csv('../Data/ground-truth-retrieval.csv',index=False)

In [19]:
! head ../Data/ground-truth-retrieval.csv

id,question
Vegetable Pad Thai,What are the main ingredients used in the Vegetable Pad Thai recipe?
Vegetable Pad Thai,How long does it take to cook the Vegetable Pad Thai from start to finish?
Vegetable Pad Thai,What can I use instead of tofu in the Vegetable Pad Thai recipe?
Vegetable Pad Thai,What type of cuisine does the Vegetable Pad Thai belong to?
Vegetable Pad Thai,Can you describe the flavor profile of the Vegetable Pad Thai?
Chana Masala,What type of cuisine does Chana Masala belong to?
Chana Masala,How long does it take to cook Chana Masala?
Chana Masala,What are the main ingredients in Chana Masala?
Chana Masala,Can I substitute chickpeas with another ingredient?
