In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

zsh:1: command not found: wget


In [57]:
import pandas as pd
import minsearch
import json
import os
import random
from openai import OpenAI
from tqdm.auto import tqdm

## Ingestion

In [10]:
df = pd.read_csv('../data/recipes.csv')
columns = ['Calories', 'FatContent', 'SaturatedFatContent',
                     'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                     'FiberContent', 'SugarContent', 'ProteinContent']
for column in columns:
    df[column] = df[column].astype(str).apply(lambda x: x.lower())

In [11]:
documents = df.to_dict(orient='records')

In [12]:
index = minsearch.Index(
    text_fields=['Name', 'Description', 'RecipeInstructions', 'Calories', 'FatContent', 'SaturatedFatContent',
                     'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                     'FiberContent', 'SugarContent', 'ProteinContent'],
    keyword_fields=['RecipeId']
)

In [13]:
index.fit(documents)

<minsearch.Index at 0x14a521be0>

In [17]:
os.environ["OPENAI_API_KEY"]=''

## RAG flow

In [18]:

client = OpenAI()

In [19]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [20]:
prompt_template = """
You're a diet plan assistant. Answer the QUESTION based on the CONTEXT from our recipes database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
Name: {Name}
Description: {Description}
RecipeInstructions: {RecipeInstructions}
Calories: {Calories}
FatContent: {FatContent}
SaturatedFatContent: {SaturatedFatContent}
CholesterolContent: {CholesterolContent}
SodiumContent: {SodiumContent}
CarbohydrateContent: {CarbohydrateContent}
FiberContent: {FiberContent}
SugarContent: {SugarContent}
ProteinContent: {ProteinContent}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [21]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [22]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [35]:
question = 'What is the total preparation time for the Low-Fat Berry Blue Frozen Dessert before it is ready to serve?'
answer = rag(question)
print(answer)

The total preparation time for the Low-Fat Berry Blue Frozen Dessert before it is ready to serve includes the following steps:

1. Toss 2 cups berries with sugar and let stand for 45 minutes.
2. Transfer berry-sugar mixture to a food processor and process with yogurt until smooth.
3. Strain through a fine sieve and pour into a baking pan (or transfer to ice cream maker).
4. Freeze uncovered until edges are solid but center is soft, then transfer to processor and blend until smooth again.
5. Return to pan and freeze until edges are solid, then transfer to processor and blend until smooth yet again.
6. Fold in remaining 2 cups of blueberries and pour into plastic mold to freeze overnight.

The detailed preparation time is as follows:
- 45 minutes for the berries to stand
- Approximately 1 hour for freezing (this is a rough estimate as the total freezing time until edges are solid is not explicitly mentioned)
- Overnight freezing time for the final product

So the total preparation time b

## Retrieval evaluation

In [48]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [49]:
df_question.head()

Unnamed: 0,RecipeId,question
0,38,What type of berries are needed for the Low-Fa...
1,38,How long should I let the berry and sugar mixt...
2,38,Is it necessary to strain the mixture after pr...
3,38,Can I use an ice cream maker instead of a baki...
4,38,How long do I need to freeze the dessert befor...


In [50]:
ground_truth = df_question.to_dict(orient='records')

In [51]:
ground_truth[0]

{'RecipeId': 38,
 'question': 'What type of berries are needed for the Low-Fat Berry Blue Frozen Dessert?'}

In [52]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [53]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [54]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['RecipeId']
        results = search_function(q)
        relevance = [d['RecipeId'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [55]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/850 [00:00<?, ?it/s]

{'hit_rate': 0.8247058823529412, 'mrr': 0.6911736694677868}

## Finding the best parameters

In [58]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [59]:
def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [60]:
gt_val = df_validation.to_dict(orient='records')

In [61]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [62]:
param_ranges = {
    "Name": (0.0, 3.0),
    "Description": (0.0, 3.0),
    "RecipeInstructions": (0.0, 3.0),
    "Calories": (0.0, 3.0),
    "FatContent": (0.0, 3.0),
    "SaturatedFatContent": (0.0, 3.0),
    "CholesterolContent": (0.0, 3.0),
    "SodiumContent": (0.0, 3.0),
    "CarbohydrateContent": (0.0, 3.0),
    "FiberContent": (0.0, 3.0),
    "SugarContent": (0.0, 3.0),
    "ProteinContent": (0.0, 3.0)
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [63]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'Name': 1.9453134835186232,
  'Description': 0.57263614744975,
  'RecipeInstructions': 2.2605129924168654,
  'Calories': 1.413872666770192,
  'FatContent': 1.2197707835582658,
  'SaturatedFatContent': 0.6749506444706871,
  'CholesterolContent': 0.7667530256711714,
  'SodiumContent': 1.4001091202989766,
  'CarbohydrateContent': 2.536721142018719,
  'FiberContent': 2.428219140020623,
  'SugarContent': 0.28023824634116423,
  'ProteinContent': 0.03414868138921001},
 0.6906111111111111)

In [64]:
def minsearch_improved(query):
    boost = {
        'Name': 1.94,
        'Description': 0.57,
        'RecipeInstructions': 2.26,
        'Calories': 1.41,
        'FatContent': 1.21,
        'SaturatedFatContent': 0.67,
        'CholesterolContent': 0.76,
        'SodiumContent': 1.40,
        'CarbohydrateContent': 2.53,
        'FiberContent': 2.42,
        'SugarContent': 0.28,
        'ProteinContent': 0.03
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/850 [00:00<?, ?it/s]

{'hit_rate': 0.8541176470588235, 'mrr': 0.709549953314659}

## RAG evaluation

In [65]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [66]:
len(ground_truth)

850

In [67]:
record = ground_truth[0]


In [68]:
print(record)

{'RecipeId': 38, 'question': 'What type of berries are needed for the Low-Fat Berry Blue Frozen Dessert?'}


In [69]:
prompt = prompt2_template.format(question=question, answer_llm=record)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the total preparation time for the Low-Fat Berry Blue Frozen Dessert before it is ready to serve?
Generated Answer: {'RecipeId': 38, 'question': 'What type of berries are needed for the Low-Fat Berry Blue Frozen Dessert?'}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [70]:
df_sample = df_question.sample(n=200, random_state=1)

In [71]:
sample = df_sample.to_dict(orient='records')

In [72]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [74]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['RecipeId'] = df_eval.record.apply(lambda d: d['RecipeId'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [94]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.835
PARTLY_RELEVANT    0.150
NON_RELEVANT       0.015
Name: proportion, dtype: float64

In [75]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [76]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,RecipeId,question,relevance,explanation
0,The provided context does not specify a cookin...,144,How long do I need to cook the egg noodles bef...,NON_RELEVANT,The generated answer does not provide any rele...
3,The provided context does not specify any inst...,211,How long should I chill the marshmallows and c...,NON_RELEVANT,The generated answer states that the context d...
10,The recipe context does not provide specific i...,39,How long do I need to cook the rice after addi...,NON_RELEVANT,The generated answer does not address the ques...
17,The provided context does not include any info...,83,How long should I fry the patties to achieve t...,NON_RELEVANT,The generated answer states that there is no i...
104,The context provided does not mention a recipe...,60,How long do I need to bake the crust layer bef...,NON_RELEVANT,The generated answer does not address the ques...
150,The context provided does not specify the temp...,89,What temperature should the icing mixture reac...,NON_RELEVANT,The generated answer fails to provide any info...
165,The context provided does not include instruct...,195,What should I do with the linguine once it is ...,NON_RELEVANT,The generated answer fails to provide any rele...
181,The context does not provide specific informat...,104,Is there a recommended way to store leftovers ...,NON_RELEVANT,The generated answer states that there is no s...
199,The provided context does not include a recipe...,59,What ingredients do I mix with the ricotta che...,NON_RELEVANT,The generated answer does not address the ques...


In [77]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [78]:
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval['RecipeId'] = df_eval.record.apply(lambda d: d['RecipeId'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [79]:
df_eval.relevance.value_counts()

relevance
RELEVANT           114
PARTLY_RELEVANT     12
NON_RELEVANT         6
Name: count, dtype: int64

In [80]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.863636
PARTLY_RELEVANT    0.090909
NON_RELEVANT       0.045455
Name: proportion, dtype: float64

In [81]:
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)