In [1]:
import pandas as pd
from tqdm.auto import tqdm
import json
import random

## Ingestion

In [2]:
df = pd.read_csv('data.csv')
documents = df.to_dict(orient='records')

In [3]:
import minsearch

In [5]:
index = minsearch.Index(
    text_fields=['recipe_name', 'meal_type', 'dietary_category', 'ingredients',
       'equipment_needed', 'nutritional_information', 'instructions', 'allergens', 'prep_time_minutes'],
    keyword_fields=['id']
)

In [6]:
# Ensure all fields in text_fields are strings
for doc in documents:
    for field in ['recipe_name', 'meal_type', 'dietary_category', 'ingredients',
                  'equipment_needed', 'nutritional_information', 'instructions', 'allergens', 'prep_time_minutes']:
        doc[field] = str(doc[field]) if doc[field] is not None else ""

In [7]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1488243d0>

## Ollama Setup & LLM Function

In [8]:
import requests
import json

def ollama_generate(model, prompt, temperature=0.0):
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "temperature": temperature,
        "options": {
            "num_ctx": 8192
        }
    }
    
    response = requests.post(url, json=payload)
    if response.status_code != 200:
        raise Exception(f"Ollama error: {response.text}")
    
    return response.json()['response']

## RAG Flow (Now using Ollama)

In [9]:
def search(query):
    boost = {
        'recipe_name': 3.0,
        'meal_type': 1.5,
        'dietary_category': 1.0,
        'ingredients': 3.5,
        'equipment_needed': 1.0,
        'nutritional_information': 2.0,
        'instructions': 1.0,
        'allergens': 1.5,
        'prep_time_minutes': 0.5
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

In [10]:
prompt_template = """
You're a nutrition assistant named BiteBlender. Answer the QUESTION based on the CONTEXT from our nutrition database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
recipe_name: {recipe_name}
meal_type: {meal_type}
dietary_category: {dietary_category}
ingredients: {ingredients}
equipment_needed: {equipment_needed}
nutritional_information: {nutritional_information}
instructions: {instructions}
allergens: {allergens}
prep_time_minutes: {prep_time_minutes}
""".strip()

def build_prompt(query, search_results):
    context = ""
    for doc in search_results:
        context += entry_template.format(**doc) + "\n\n"
    
    return prompt_template.format(question=query, context=context).strip()

In [11]:
def llm(prompt, model="qwen2:1.5b"):
    return ollama_generate(model=model, prompt=prompt)

In [12]:
def rag(query, model="qwen2:1.5b"):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

## Test Both Models

In [13]:
question = "What ingredients are needed for lentil soup?"

print("Qwen2:1.5b:")
print(rag(question, model="qwen2:1.5b"))

print("\n" + "="*60 + "\n")

print("Llama3.2:3b:")
print(rag(question, model="llama3.2:3b"))

Qwen2:1.5b:
The ingredients needed for Lentil Soup are as follows:

- Cucumber
- Mushroom
- Quinoa (if not using gluten-free options)
- Salmon (or chicken)
- Avocado
- Bell Pepper (optional)
- Spinach or kale (for garnish if serving it on the side)
- Onion (if desired for flavor, not necessary for soup itself)
- Tomatoes (if you want to add acidity and tomato-based taste, optional but recommended)
- Grated cheese (for adding creaminess, not essential for soup alone)

Please note that the ingredients may vary based on personal preferences or specific diet requirements.


Llama3.2:3b:
Based on the provided CONTEXT, I can see that there are two recipes for Lentil Soup.

The recipe "Lentil Soup" (meal_type: Breakfast, dietary_category: Gluten-Free) has the following ingredients:

1. quinoa
2. mushroom
3. chicken

These are the ingredients needed for this specific recipe.


## Retrieval Evaluation (Same as before)

In [14]:
df_question = pd.read_csv('ground-truth-retrieval.csv')
ground_truth = df_question.to_dict(orient='records')

def hit_rate(relevance_total):
    return sum(any(line) for line in relevance_total) / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank, val in enumerate(line):
            if val:
                total_score += 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {'hit_rate': hit_rate(relevance_total), 'mrr': mrr(relevance_total)}

In [15]:
evaluate(ground_truth, lambda q: search(q['question']))

  0%|          | 0/818 [00:00<?, ?it/s]

{'hit_rate': 0.26772616136919314, 'mrr': 0.09609917724220919}

## RAG Evaluation with Both Models

In [16]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

def evaluate_rag(model_name, n_samples=200):
    sample = df_question.sample(n=n_samples, random_state=1).to_dict(orient='records')
    evaluations = []

    for record in tqdm(sample, desc=f"Evaluating {model_name}"):
        q = record['question']
        answer_llm = rag(q, model=model_name)

        eval_prompt = prompt2_template.format(question=q, answer_llm=answer_llm)
        evaluation = ollama_generate(model="llama3.2:3b", prompt=eval_prompt)
        
        try:
            evaluation_json = json.loads(evaluation)
        except:
            evaluation_json = {"Relevance": "NON_RELEVANT", "Explanation": "Failed to parse evaluator response"}
        
        evaluations.append((record, answer_llm, evaluation_json))

    df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    df_eval['id'] = df_eval.record.apply(lambda x: x['id'])
    df_eval['question'] = df_eval.record.apply(lambda x: x['question'])
    df_eval['relevance'] = df_eval.evaluation.apply(lambda x: x.get('Relevance', 'NON_RELEVANT'))
    df_eval['explanation'] = df_eval.evaluation.apply(lambda x: x.get('Explanation', ''))
    
    del df_eval['record']
    del df_eval['evaluation']
    
    print(f"\n=== Results for {model_name} ===")
    print(df_eval.relevance.value_counts(normalize=True))
    
    df_eval.to_csv(f'../data/rag-eval-{model_name.replace(":", "-")}.csv', index=False)
    return df_eval

In [17]:
# Run evaluation for both models
eval_qwen = evaluate_rag("qwen2:1.5b", n_samples=150)
eval_llama = evaluate_rag("llama3.2:3b", n_samples=150)

Evaluating qwen2:1.5b:   0%|          | 0/150 [00:00<?, ?it/s]


=== Results for qwen2:1.5b ===
relevance
NON_RELEVANT       0.926667
PARTLY_RELEVANT    0.053333
RELEVANT           0.020000
Name: proportion, dtype: float64


Evaluating llama3.2:3b:   0%|          | 0/150 [00:00<?, ?it/s]


=== Results for llama3.2:3b ===
relevance
NON_RELEVANT       0.933333
PARTLY_RELEVANT    0.046667
RELEVANT           0.020000
Name: proportion, dtype: float64
