In [7]:
import pandas as pd

In [5]:
from openai import OpenAI
import os

# Shared OpenAI client used by every request in this notebook; the API key is
# looked up in the OPENAI_API_KEY environment variable rather than written here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [8]:
# Load the cleaned recipe table, then turn it into a list with one dict per row
# (column name -> value) so each record can be passed to str.format.
df = pd.read_csv('../data/data_clean.csv')
documents = df.to_dict('records')

In [9]:
# Prompt used to generate ground-truth questions for one recipe record.
# The {placeholders} are filled with str.format from a record's fields; the
# doubled braces {{ }} render as literal braces in the JSON example.
prompt_template = """
You emulate a user of our recipe assistant application.
Formulate 5 questions this user might ask based on a provided recipe.
Make the questions specific to this recipe.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

recipe_name: {recipe_name}
cuisine_type: {cuisine_type}
ingredients: {ingredients}
cooking_instructions: {cooking_instructions}
dietary_preferences: {dietary_preferences}
difficulty: {difficulty}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [10]:
# Render the template with the first record to try the prompt on a single recipe.
prompt = prompt_template.format(**documents[0])

In [11]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that was previously hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [12]:
# Raw reply text from the model for the single-recipe prompt.
questions = llm(prompt)

In [13]:
import json

In [14]:
# Check that the reply parses as JSON; the parsed value is shown as the cell output.
json.loads(questions)

{'questions': ['What type of cheese is used in Spaghetti Carbonara?',
  'How long should I cook the spaghetti?',
  'What should I do after frying the pancetta?',
  'Is there any dietary preference mentioned for this recipe?',
  'What is the difficulty level of making this dish?']}

In [15]:
def generate_questions(doc):
    """Ask the model for questions about one recipe record.

    Args:
        doc: Mapping that provides every placeholder named in
            ``prompt_template``; it is expanded with ``str.format(**doc)``.

    Returns:
        The model's raw reply text, unparsed. The caller is responsible for
        decoding it as JSON.
    """
    prompt = prompt_template.format(**doc)
    # Delegate to llm() instead of repeating the request, so the model name
    # and message layout are defined in one place.
    return llm(prompt)

In [16]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Maps each record's 'id' to its list of generated questions; filled by the loop below.
results = {}

In [18]:
# Generate questions for every recipe. Ids already present in `results` are
# skipped, so re-running this cell after a failure resumes where it stopped.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    # Parse into its own name so this loop does not rebind the notebook-level
    # `questions` string produced by the single-recipe example.
    questions_parsed = json.loads(questions_raw)
    results[doc_id] = questions_parsed['questions']

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:37<00:00,  1.57s/it]


In [19]:
# Flatten {id: [question, ...]} into one (id, question) tuple per question,
# keeping the insertion order of `results`. A comprehension is used so no loop
# variables are left behind at notebook scope.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [20]:
# Peek at the first (id, question) pair.
final_results[0]

(0, 'What type of cheese is used in the Spaghetti Carbonara recipe?')

In [21]:
# One row per (id, question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [22]:
# Save the ground-truth set; index=False leaves the DataFrame index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [23]:
# Show the first lines of the written file (shell command run through IPython's `!`).
!head ../data/ground-truth-retrieval.csv

id,question
0,What type of cheese is used in the Spaghetti Carbonara recipe?
0,How do I know when the spaghetti is cooked al dente?
0,What is the first step in cooking the Spaghetti Carbonara?
0,Is there a specific type of meat recommended for this recipe?
0,Are there any dietary preferences mentioned for this dish?
1,What ingredients are needed for Chicken Tikka Masala?
1,What is the first step in the cooking instructions?
1,Can you tell me if Chicken Tikka Masala is suitable for gluten-free diets?
1,What is the difficulty level of preparing this recipe?
