In [None]:
# Source file: https://www.kaggle.com/datasets/shuyangli94/foodcom-recipes-with-search-terms-and-tags?resource=download
# This notebook uses a 50K-recipe subset of the original 400K recipes to demonstrate our RAG model.

In [None]:
!pip install -q pandas
!pip install -q numpy
!pip install faiss-cpu
!pip install -q sentence-transformers

[0m

In [None]:
import html
import unicodedata
import re
import pandas as pd
import numpy as np
import urllib.parse
import faiss
import json
import sqlite3
from typing import List, Dict, Tuple, Any, Set, Optional
from sentence_transformers import SentenceTransformer

In [None]:
# Mount Google Drive so the dataset and the generated artifacts live under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to the FAISS vector index file (stores vectors for search)
# NOTE(review): the vectorization cell writes the index to
# ".../Gen AI Project/recipe_faiss.index", which is a different filename from
# the one below - confirm which path the loading code is expected to use.
faiss_path = "/content/drive/MyDrive/Gen AI Project/recipe_faiss_index.index"

Mounted at /content/drive


In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. EDA

In [None]:
# Location of the 50K-recipe CSV on the mounted Drive.
file_path = "/content/drive/MyDrive/Gen AI Project/50K recipes.csv"
# Alternative local (Windows) location, kept for reference only:
#file_path = r"C:\Users\dolly\Desktop\Modules\Project Implementation\50K recipes.csv"

# Explicit dtypes: every text-like column (including `id`) is read as a string,
# and `servings` uses pandas' nullable integer dtype "Int64".
column_dtypes = {
    "id": str,
    "name": str,
    "description": str,
    "ingredients": str,
    "ingredients_raw_str": str,
    "serving_size": str,
    "servings": "Int64",
    "steps": str,
    "tags": str,
    "search_terms": str
}

# Load the raw dataset and preview the first rows.
df = pd.read_csv(file_path, dtype=column_dtypes)
df.head()


Unnamed: 0,id,name,description,ingredients,ingredients_raw_str,serving_size,servings,steps,tags,search_terms
0,232037,Simple Shrimp and Andouille Jambalaya,"Simple, easy and very tasty for when you are i...","['onion', 'red bell pepper', 'garlic cloves', ...","[""1 medium onion, chopped coarse "",""1 m...",1 (366 g),4,"['In a food processor, pulse the onion, red pe...","['60-minutes-or-less', 'time-to-make', 'course...","{'dinner', 'shrimp'}"
1,41090,black-and-white bean salad,,"['white beans', 'canned black beans', 'tomatoe...","[""1 cup canned white beans, rinsed and dra...",1 (807 g),1,"['In a large bowl, combine beans, tomato, onio...","['15-minutes-or-less', 'time-to-make', 'course...","{'vegetarian', 'salad', 'side', 'dinner', 'veg..."
2,60656,Crock Pot Italian Zucchini,This is a good recipe for weight watchers. It ...,"['zucchini', 'yellow squash', 'diced tomatoes'...","[""2 zucchini, sliced "",""2 small yel...",1 (244 g),4,['Put all ingredients in the crock pot and coo...,"['weeknight', 'time-to-make', 'course', 'main-...","{'side', 'vegetarian', 'italian'}"
3,232047,Beef Stew With Dried Cherries,This is a fabulous stew that came from one of ...,"['beef stew meat', 'flour', 'salt', 'allspice'...","[""3 lbs beef stew meat"",""3 tablespoons ...",1 (358 g),8,"['Preheat oven to 350°F.', ""Cut beef into 1 in...","['time-to-make', 'course', 'main-ingredient', ...",{'dinner'}
4,232050,Hot Sweet Almond Brittle,This is one of our standard holiday gift recip...,"['slivered almonds', 'cider vinegar', 'sugar',...","[""12 ounces slivered almonds"",""1/4 cup ...",1 (832 g),1,['Preheat oven to 375°F Place almonds in sing...,"['time-to-make', 'course', 'preparation', 'des...",{'dessert'}


In [None]:
# Per-column completeness report: null count, non-null count, and the null
# share expressed as a percentage of all rows.
data_summary = pd.DataFrame(
    {
        "Missing Values": df.isnull().sum(),
        "Remaining Count": df.notnull().sum(),
    }
).assign(**{"% Missing": lambda summary: (summary["Missing Values"] / len(df)) * 100})
data_summary


Unnamed: 0,Missing Values,Remaining Count,% Missing
id,0,50000,0.0
name,0,50000,0.0
description,1789,48211,3.578
ingredients,0,50000,0.0
ingredients_raw_str,0,50000,0.0
serving_size,0,50000,0.0
servings,0,50000,0.0
steps,0,50000,0.0
tags,0,50000,0.0
search_terms,0,50000,0.0


In [None]:
# Check for duplicate rows: counts rows that repeat an earlier row across every column
df.duplicated().sum()


np.int64(0)

## 2. Preprocess Dataset

In [None]:
# Build the working frame from the raw data without the free-text
# `description` column; the raw `df` itself is left unchanged.
df_preprocessed = df.drop(columns=["description"])
df_preprocessed.head()

Unnamed: 0,id,name,ingredients,ingredients_raw_str,serving_size,servings,steps,tags,search_terms
0,232037,Simple Shrimp and Andouille Jambalaya,"['onion', 'red bell pepper', 'garlic cloves', ...","[""1 medium onion, chopped coarse "",""1 m...",1 (366 g),4,"['In a food processor, pulse the onion, red pe...","['60-minutes-or-less', 'time-to-make', 'course...","{'dinner', 'shrimp'}"
1,41090,black-and-white bean salad,"['white beans', 'canned black beans', 'tomatoe...","[""1 cup canned white beans, rinsed and dra...",1 (807 g),1,"['In a large bowl, combine beans, tomato, onio...","['15-minutes-or-less', 'time-to-make', 'course...","{'vegetarian', 'salad', 'side', 'dinner', 'veg..."
2,60656,Crock Pot Italian Zucchini,"['zucchini', 'yellow squash', 'diced tomatoes'...","[""2 zucchini, sliced "",""2 small yel...",1 (244 g),4,['Put all ingredients in the crock pot and coo...,"['weeknight', 'time-to-make', 'course', 'main-...","{'side', 'vegetarian', 'italian'}"
3,232047,Beef Stew With Dried Cherries,"['beef stew meat', 'flour', 'salt', 'allspice'...","[""3 lbs beef stew meat"",""3 tablespoons ...",1 (358 g),8,"['Preheat oven to 350°F.', ""Cut beef into 1 in...","['time-to-make', 'course', 'main-ingredient', ...",{'dinner'}
4,232050,Hot Sweet Almond Brittle,"['slivered almonds', 'cider vinegar', 'sugar',...","[""12 ounces slivered almonds"",""1/4 cup ...",1 (832 g),1,['Preheat oven to 375°F Place almonds in sing...,"['time-to-make', 'course', 'preparation', 'des...",{'dessert'}


In [None]:
# Clean ingredients column: remove single quotes and square brackets

def parse_column(value, newline=False):
    """Flatten a list-like cell into one delimited string.

    Args:
        value: A list or set, a string that starts with "[" and ends with "]",
            any other scalar, or a missing value (None / NaN).
        newline: When True the items are joined with a newline character,
            otherwise with ", ".

    Returns:
        The joined string, or "" when `value` is a missing scalar.
    """
    # Containers are handled before the null check: pd.isnull() on a list
    # returns an element-wise array, and using that array in `if` raises
    # ValueError for lists with more than one item.
    if isinstance(value, (list, set)):
        items = [str(i).strip() for i in value]
    elif pd.isnull(value):
        return ""
    elif isinstance(value, str) and value.startswith("[") and value.endswith("]"):
        # Splits only on the exact sequence ',' (quote, comma, quote with no
        # space). Entries separated by "', '" stay inside a single item, and
        # only the outermost quotes/spaces of each item are trimmed.
        items = [item.strip(" '\"") for item in value.strip("[]").split("','")]
    else:
        items = [str(value)]

    return "\n".join(items) if newline else ", ".join(items)

# Clean ingredients_raw_str: newline format + replace double quotes with single quotes
df_preprocessed["ingredients_raw_str"] = (
    df_preprocessed["ingredients_raw_str"]
    .apply(lambda x: parse_column(x, newline=True))
    .str.replace('"', "'", regex=False)  # Replace double quotes with single
    .str.replace(r"',\s*'", ", ", regex=True)  # Replace ',' pattern between quotes
    .str.replace(r"'(?!s\b)", "", regex=True)  # removes all ' except those in 's

    .str.strip()
)

# Clean ingredients: drop double quotes and square brackets, then turn the
# quote-comma-quote separators into plain ", "
df_preprocessed["ingredients"] = (
    df_preprocessed["ingredients"]
    .apply(lambda x: parse_column(x))
    .str.replace('"', "", regex=False)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace(r"',\s*'", ", ", regex=True)  # Optional, in case it's present
    .str.strip()
)

# Clean steps: same separator handling as above, keeping quotes as single quotes
df_preprocessed["steps"] = (
    df_preprocessed["steps"]
    .apply(lambda x: parse_column(x, newline=True))
    .str.replace('"', "'", regex=False)
    .str.replace(r"',\s*'", ", ", regex=True)  # Replace ',' pattern between quotes
    .str.strip()
)

# Clean tags
df_preprocessed["tags"] = (
    df_preprocessed["tags"]
    .apply(lambda x: parse_column(x))
    .str.replace('"', "", regex=False)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace(r"',\s*'", ", ", regex=True)  # Optional, in case it's present
    .str.strip()
)

# Clean search_terms: the raw values look like a set literal ({'a', 'b'}), so
# braces are dropped and quotes become spaces before the commas are normalized
df_preprocessed["search_terms"] = (
    df_preprocessed["search_terms"]
    .apply(lambda x: parse_column(x))  # Flatten if stored as list/dict-like string
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
    .str.replace("'", " ", regex=False)
    .str.replace(r"\s*,\s*", ",", regex=True)  # Optional: ensure consistent commas
    .str.strip()
)

# Normalize comma spacing across relevant columns
# (`name` and `serving_size` are not part of this pass)
columns_to_fix_commas = ["ingredients", "ingredients_raw_str", "steps", "tags", "search_terms"]

for col in columns_to_fix_commas:
    df_preprocessed[col] = (
        df_preprocessed[col]
        .str.replace(r"\s*,\s*", ", ", regex=True)  # Ensure 1 space after each comma
        .str.replace(r"\s{2,}", " ", regex=True)     # Collapse double spaces
        .str.strip()
    )

# Lowercase all string (object) columns
for col in df_preprocessed.select_dtypes(include="object").columns:
    df_preprocessed[col] = df_preprocessed[col].str.lower().str.strip()

# Replace °f with degrees fahrenheit in all text columns
# (runs after lowercasing, so the original "°F" is matched as "°f")
for col in df_preprocessed.select_dtypes(include="object").columns:
    df_preprocessed[col] = df_preprocessed[col].str.replace("°f", " degrees fahrenheit", regex=False)



df_preprocessed.head(2)

Unnamed: 0,id,name,ingredients,ingredients_raw_str,serving_size,servings,steps,tags,search_terms
0,232037,simple shrimp and andouille jambalaya,"onion, red bell pepper, garlic cloves, large s...","1 medium onion, chopped coarse, 1 medium red b...",1 (366 g),4,"in a food processor, pulse the onion, red pepp...","60-minutes-or-less, time-to-make, course, main...","dinner, shrimp"
1,41090,black-and-white bean salad,"white beans, canned black beans, tomatoes, oni...","1 cup canned white beans, rinsed and drained, ...",1 (807 g),1,"in a large bowl, combine beans, tomato, onion ...","15-minutes-or-less, time-to-make, course, main...","vegetarian, salad, side, dinner, vegan"


In [None]:
# Decode HTML entities and URL-encoded characters in all string columns

def decode_html_and_url(text):
    """Return `text` with percent-encoding and HTML entities decoded.

    Missing values (None / NaN) become an empty string; any other value is
    converted with str() first.
    """
    if pd.isnull(text):
        return ""
    decoded = urllib.parse.unquote(str(text))
    # Two unescape passes so double-encoded entities (e.g. "&amp;amp;") resolve fully.
    for _ in range(2):
        decoded = html.unescape(decoded)
    return decoded

# Apply the decoder to every object (string) column; nulls come back as "".
for col in df_preprocessed.select_dtypes(include="object").columns:
    df_preprocessed[col] = df_preprocessed[col].apply(decode_html_and_url)


# Show the first record vertically (one field per row) for easier reading.
df_preprocessed.head(1).T

Unnamed: 0,0
id,232037
name,simple shrimp and andouille jambalaya
ingredients,"onion, red bell pepper, garlic cloves, large s..."
ingredients_raw_str,"1 medium onion, chopped coarse, 1 medium red b..."
serving_size,1 (366 g)
servings,4
steps,"in a food processor, pulse the onion, red pepp..."
tags,"60-minutes-or-less, time-to-make, course, main..."
search_terms,"dinner, shrimp"


In [None]:
# Check for extra whitespace
def has_extra_whitespace(text):
    """Return True when str(text) contains 2+ consecutive whitespace characters, a tab, or a newline."""
    untidy_pattern = r"\s{2,}|\t|\n"  # 2+ spaces, tabs, or newlines
    return re.search(untidy_pattern, str(text)) is not None

# Check across all object columns and report only the ones that still contain
# rows flagged by has_extra_whitespace
for col in df_preprocessed.select_dtypes(include="object").columns:
    count = df_preprocessed[col].apply(has_extra_whitespace).sum()
    if count > 0:
        print(f"Column '{col}' has {count} rows with extra whitespace.")


Column 'name' has 533 rows with extra whitespace.
Column 'steps' has 22 rows with extra whitespace.


In [None]:
# Final cleanup: remove extra whitespace across all object columns
for col in df_preprocessed.select_dtypes(include="object").columns:
    df_preprocessed[col] = (
        df_preprocessed[col]
        # \s also matches tabs/newlines, so any run of 2+ whitespace characters
        # (not only spaces) becomes a single space here.
        .str.replace(r"\s{2,}", " ", regex=True)  # Collapse 2+ spaces into 1
        # Any remaining lone tab or newline is replaced by a single space.
        .str.replace(r"\t|\n", " ", regex=True)   # Remove tabs and newlines
        .str.strip()                              # Trim leading/trailing spaces
    )

# Optional: re-check to confirm whitespace is gone
for col in df_preprocessed.select_dtypes(include="object").columns:
    count = df_preprocessed[col].apply(has_extra_whitespace).sum()
    if count > 0:
        print(f"{col}: still has {count} rows with extra whitespace.")
    else:
        print(f"{col}: do not have extra spacing.")


id: do not have extra spacing.
name: do not have extra spacing.
ingredients: do not have extra spacing.
ingredients_raw_str: do not have extra spacing.
serving_size: do not have extra spacing.
steps: do not have extra spacing.
tags: do not have extra spacing.
search_terms: do not have extra spacing.


In [None]:
# Determine the most common search terms
# (the variables are named "tags", but the column analysed is `search_terms`)

# First, split each row's ", "-separated search terms and explode them into one term per row
tags_series = df_preprocessed['search_terms'].apply(lambda x: x.split(', ')).explode()

# Count how many times each distinct term occurs (sorted most frequent first)
tags_counts = tags_series.value_counts()

# Keep the 50 most frequent terms
top_50_tags = tags_counts.head(50)

# Display the result
top_50_tags

Unnamed: 0_level_0,count
search_terms,Unnamed: 1_level_1
dinner,22527
dessert,11922
low-sodium,9727
low-carb,9338
low-calorie,7962
healthy,7273
vegetarian,7100
side,5360
lunch,5280
low-fat,4991


## 3. Select Columns to Embed for Retrieval

In [None]:
# Combine relevant fields into a single text field for embedding.
# Fields are joined with ". "; `servings` is cast to str because it is stored
# as a nullable integer. A field that already ends with "." yields ".." in the
# combined text (visible in the preview below).

df_preprocessed["rag_text"] = (
    df_preprocessed["name"] + ". " +
    df_preprocessed["ingredients"] + ". " +
    df_preprocessed["ingredients_raw_str"] + ". " +
    "serving size: " + df_preprocessed["serving_size"] + ", servings: " + df_preprocessed["servings"].astype(str) + ". " +
    df_preprocessed["steps"] + ". " +
    df_preprocessed["tags"] + ". " +
    df_preprocessed["search_terms"]
)
# Show full cell contents; this changes the pandas display option for the
# rest of the session, not just for this cell.
pd.set_option("display.max_colwidth", None)
df_preprocessed[["id", "name", "rag_text"]].head(2)


Unnamed: 0,id,name,rag_text
0,232037,simple shrimp and andouille jambalaya,"simple shrimp and andouille jambalaya. onion, red bell pepper, garlic cloves, large shrimp, salt, hot pepper sauce, vegetable oil, andouille sausage, long grain rice, bay leaves, diced tomatoes, clam juice, fresh parsley. 1 medium onion, chopped coarse, 1 medium red bell pepper, chopped coarse, 5 medium garlic cloves, chopped coarse, 1 lb extra large shrimp, shelled and deveined, salt, hot pepper sauce, 1 tablespoon vegetable oil, 3/4 lb andouille sausage, halved lengthwise and then cut into 1/4 inch slices, 1 1/2 cups long grain rice, 4 bay leaves, 1 (14 ounce) can diced tomatoes, briefly drained, 2 (8 ounce) bottles clam juice, 1/4 cup fresh parsley, chopped. serving size: 1 (366 g), servings: 4. in a food processor, pulse the onion, red pepper and garlic until chopped very fine, ten to twelve 1-second pulses, scraping down the sides of the bowl as necessary; set aside., season the shrimp with salt and hot pepper sauce to taste., heat the oil in a large dutch oven over medium high heat until shimmering. add the shrimp in a single layer and cook, without stirring, for 30 seconds. using tongs, flip the shrimp and cook for another 30 seconds. transfer the shrimp to a medium bowl and set aside., add the sausage to the pan and cook, stirring occasionally, until lightly browned, about 3 minutes. using a slotted spoon, transfer the sausage to a second bowl., scrape the veggies from the food processor into the empty pot and cook, stirring frequently, until softened, about 3 minutes. 
stir in the rice and continue to stir until the grains are coated with the fat, about 1 minute., add the bay leaves, tomatoes, clam juice, and 1 cup water and bring to a boil., stir in the sausage, cover, and reduce the heat to low., cook until the rice is tender, 17 to 20 minutes., off the heat, stir in the shrimp and parsley with a fork (do not mush the rice), cover the pot, and allow to sit for 2 minutes or until the shrimp is heated through., discard the bay leaves and adjust the seasoning with salt and hot pepper sauce to taste. serve immediately.. 60-minutes-or-less, time-to-make, course, main-ingredient, preparation, main-dish, pork, seafood, easy, shrimp, one-dish-meal, meat, pork-sausage, shellfish. dinner, shrimp"
1,41090,black-and-white bean salad,"black-and-white bean salad. white beans, canned black beans, tomatoes, onion, celery, white wine vinegar, italian parsley, table salt, black pepper, olive oil. 1 cup canned white beans, rinsed and drained, 1 cup canned black beans, rinsed and drained, 1 large tomatoes, diced, 1 small onion, diced, 1 stalk celery, diced, 1 tablespoon white wine vinegar, 2 tablespoons italian parsley, minced, 1/8 teaspoon table salt, to taste, 1/8 teaspoon black pepper, to taste, 1 -2 teaspoon olive oil, to taste. serving size: 1 (807 g), servings: 1. in a large bowl, combine beans, tomato, onion and celery., gently stir in vinegar and sprinkle with parsley; season to taste and serve.. 15-minutes-or-less, time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, salads, side-dishes, beans, american, easy, no-cook, potluck, dinner-party, picnic, spring, summer, vegan, vegetarian, dietary, seasonal, inexpensive, black-beans, to-go, number-of-servings, 3-steps-or-less, technique. vegetarian, salad, side, dinner, vegan"


## 4. Vectorization

In [None]:
# Build sentence embeddings for every recipe's `rag_text`, index them with
# FAISS, and persist both the index and the recipe metadata to Drive.
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pandas as pd
import torch
from tqdm.notebook import tqdm

# Check if we have GPU and what type
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU type: {torch.cuda.get_device_name(0)}")

# Load model and move to GPU if available
model = SentenceTransformer("all-MiniLM-L6-v2")
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))

# Number of recipes handed to model.encode() per loop iteration (this drives
# the progress bar). The batch actually sent through the model is the
# `batch_size=32` argument passed to encode() below.
batch_size = 512  # Much larger batch size for A100

# Generate embeddings with progress tracking
print(f"Generating embeddings for {len(df_preprocessed)} recipes...")
embeddings = []

for i in tqdm(range(0, len(df_preprocessed), batch_size)):
    # Get batch (positional slice, so row order is preserved)
    batch = df_preprocessed.iloc[i:i+batch_size]

    # Generate embeddings for batch (use GPU acceleration)
    batch_embeddings = model.encode(batch["rag_text"].tolist(),
                                    show_progress_bar=False,
                                    batch_size=32,  # Sub-batch for GPU memory
                                    convert_to_numpy=True)

    # Append one vector per recipe, in row order
    embeddings.extend(batch_embeddings)

# Add embeddings to dataframe
# NOTE(review): this column is also written out by to_csv() below, where each
# vector is stored as text - confirm that is intended.
df_preprocessed["embedding"] = embeddings

# Convert list of vectors to np array
embedding_matrix = np.vstack(df_preprocessed["embedding"].values)

# Vector dimensionality, taken from the embedding matrix
dimension = embedding_matrix.shape[1]

# Create the index: IndexFlatL2 is an exact (brute-force) L2 index; this
# notebook installs faiss-cpu, and no training step is performed for it here.
index = faiss.IndexFlatL2(dimension)  # Using L2 distance

# If you want even faster search (with slight accuracy loss):
# nlist = 100  # number of clusters
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
# index.train(embedding_matrix)

# Add vectors to index; vector i corresponds to row position i of df_preprocessed
index.add(embedding_matrix)

# Save to Drive
# NOTE(review): `faiss_path` defined earlier points to "recipe_faiss_index.index",
# while the index is written to "recipe_faiss.index" here - confirm which one
# downstream loading code expects.
faiss.write_index(index, "/content/drive/MyDrive/Gen AI Project/recipe_faiss.index")
df_preprocessed.to_csv("/content/drive/MyDrive/Gen AI Project/recipe_metadata.csv", index=False)

print(f"Successfully created and saved index with {len(df_preprocessed)} recipes and dimension {dimension}")

GPU available: True
GPU type: NVIDIA A100-SXM4-40GB
Generating embeddings for 50000 recipes...


  0%|          | 0/98 [00:00<?, ?it/s]

Successfully created and saved index with 50000 recipes and dimension 384


## 5. Retrieval & Generation

### Retrieval Section
The retrieval stage implements a multi-layered search approach that combines:

- **Semantic Search** - Uses a FAISS vector index to find recipes semantically similar to the user's query, leveraging embeddings created with SentenceTransformer.
- **Tag-Based Filtering** - Pre-filters results based on dietary restrictions (gluten-free, low cholesterol, etc.) using the recipe tags from the dataset.
- **Ingredient Availability Matching** - Prioritizes recipes that use ingredients the user already has (from the SQL database) using Jaccard similarity.
- **Missing Ingredient Detection** - Identifies which ingredients the user would need to purchase for each recipe recommendation.
- **Fallback Mechanism** - When no good matches are found, the system can generate custom recipe suggestions based on available ingredients.

### Generation Section
The generation stage creates contextually appropriate responses through:

- **Structured Context Building** - Formats retrieved recipes and their details into a structured prompt for the LLM.
- **Personalized Response Creation** - Uses Llama 3.2 to generate natural, conversational responses that address the user's specific needs.
- **Dietary Awareness** - Explicitly acknowledges dietary restrictions and explains how recommendations meet those requirements.
- **Missing Ingredient Communication** - Clearly indicates which additional ingredients would be needed.
- **Helpful Additions** - Includes relevant cooking tips and offers to provide more detailed information.

In [None]:

def get_available_ingredients() -> List[str]:
    """
    Retrieve available ingredients from the SQL database.

    Reads `ingredient_name` from the `available_ingredients` table for rows
    with `in_stock = 1`. Names are lower-cased and stripped; NULL or blank
    names are skipped so that one bad row does not discard the whole result
    (and so that no empty string reaches substring matching downstream).

    Returns:
        A list of ingredient names. If any database error occurs, the error is
        printed and a small default list of common ingredients is returned.
    """
    conn = None
    try:
        # Connect to your SQL database - adjust connection string as needed
        conn = sqlite3.connect('your_ingredients_database.db')
        cursor = conn.cursor()

        # Query to get available ingredients
        cursor.execute("SELECT ingredient_name FROM available_ingredients WHERE in_stock = 1")

        # Normalize names for matching and drop NULL / blank entries
        available_ingredients = []
        for (name,) in cursor.fetchall():
            if name is None:
                continue
            cleaned = str(name).lower().strip()
            if cleaned:
                available_ingredients.append(cleaned)

        return available_ingredients
    except Exception as e:
        # Deliberate best-effort behavior: report the problem and fall back
        print(f"Database error: {str(e)}")
        # Return a default set of common ingredients for fallback
        return ["salt", "pepper", "water", "oil", "garlic", "onion"]
    finally:
        # Close the connection on every path, including when the query fails
        if conn is not None:
            conn.close()


In [None]:
# ------------- QUERY UNDERSTANDING -------------

def extract_dietary_restrictions(query: str) -> List[str]:
    """
    Detect dietary restrictions mentioned in a user query.

    The query is lower-cased and scanned for each restriction's trigger
    phrases using plain substring matching. Restrictions are returned in the
    order they are declared in the lookup table below.
    """
    # Canonical restriction name -> phrases that signal it
    trigger_phrases = {
        "vegetarian": ["vegetarian", "no meat"],
        "vegan": ["vegan", "plant-based", "no animal products"],
        "gluten free": ["gluten free", "gluten-free", "no gluten", "without gluten"],
        "dairy free": ["dairy free", "dairy-free", "no dairy", "lactose intolerant"],
        "low carb": ["low carb", "low-carb", "keto friendly"],
        "keto": ["keto", "ketogenic"],
        "low fat": ["low fat", "low-fat", "reduced fat"],
        "low sodium": ["low sodium", "low-sodium", "low salt"],
        "low cholesterol": ["low cholesterol", "cholesterol free", "heart healthy"],
        "nut free": ["nut free", "nut-free", "no nuts", "without nuts"],
        "egg free": ["egg free", "egg-free", "no eggs"],
        "soy free": ["soy free", "soy-free", "no soy"],
        "paleo": ["paleo", "paleolithic"],
        "pescatarian": ["pescatarian", "fish but no meat"]
    }

    normalized_query = query.lower()

    return [
        label
        for label, phrases in trigger_phrases.items()
        if any(phrase in normalized_query for phrase in phrases)
    ]

In [None]:
# ------------- RECIPE SEARCH & MATCHING -------------

def _ingredient_tokens(ingredients: List[str]) -> Set[str]:
    """Return the lower-cased words longer than 3 characters found in the given ingredient phrases."""
    return {
        word
        for phrase in ingredients
        for word in str(phrase).lower().split()
        if len(word) > 3  # Skip small words like "of", "the", etc.
    }


def calculate_jaccard_similarity(recipe_ingredients: List[str], available_ingredients: List[str]) -> float:
    """
    Calculate Jaccard similarity between recipe ingredients and available ingredients.

    Both lists are broken down into words with the same rule (lower-cased,
    words longer than 3 characters), so a multi-word available ingredient such
    as "olive oil" can match the words of a recipe ingredient. Previously only
    the recipe side was split into words while the available side was compared
    as whole phrases, so multi-word available ingredients never matched.

    Jaccard similarity = |recipe words AND available words| / |recipe words OR available words|

    Args:
        recipe_ingredients: Ingredient phrases of one recipe.
        available_ingredients: Ingredient phrases the user has on hand.

    Returns:
        A score between 0 and 1, with higher values indicating more ingredient
        overlap; 0.0 when neither list yields any word.
    """
    recipe_set = _ingredient_tokens(recipe_ingredients)
    available_set = _ingredient_tokens(available_ingredients)

    union = recipe_set | available_set

    # Avoid division by zero
    if not union:
        return 0.0

    return len(recipe_set & available_set) / len(union)

def parse_ingredient_list(ingredients_str: str) -> List[str]:
    """
    Turn an ingredient string into a list of lower-cased, trimmed ingredients.

    The string is split on commas when it contains any comma; otherwise on
    newlines when it contains any newline; otherwise it is treated as a single
    ingredient. Empty pieces are dropped. Empty or missing input gives [].
    """
    if not ingredients_str or pd.isna(ingredients_str):
        return []

    # First separator present wins: comma takes priority over newline
    separator = next((sep for sep in (',', '\n') if sep in ingredients_str), None)
    pieces = [ingredients_str] if separator is None else ingredients_str.split(separator)

    normalized = (piece.strip().lower() for piece in pieces)
    return [piece for piece in normalized if piece]  # Remove empty strings

def filter_by_tags(
    recipe_df: pd.DataFrame,
    dietary_restrictions: List[str]
) -> pd.DataFrame:
    """
    Filter recipes by tags to match dietary restrictions.

    A recipe is kept only if it satisfies every restriction. A restriction is
    satisfied when the recipe's `tags` contain the restriction name, where the
    words of the name may be separated by a hyphen or whitespace (so
    "low carb" matches the dataset's "low-carb" tag). Two restrictions have an
    additional ingredient-based rule:

    - "gluten free": also satisfied when no listed gluten ingredient appears
      in `ingredients`.
    - "low cholesterol": also satisfied when no listed high-cholesterol
      ingredient appears in `ingredients`.

    Args:
        recipe_df: DataFrame with at least a `tags` column (and `ingredients`
            for the two special cases).
        dietary_restrictions: Restriction names, e.g. "vegan", "gluten free".

    Returns:
        The rows of `recipe_df` that satisfy all restrictions, with their
        original index labels. `recipe_df` itself is returned when no
        restrictions are given.
    """
    if not dietary_restrictions:
        return recipe_df

    # The mask shares recipe_df's index so boolean operations and the final
    # selection stay aligned even when the index is not 0..n-1.
    mask = pd.Series(True, index=recipe_df.index)

    for restriction in dietary_restrictions:
        # Allow "-" or whitespace between the words of the restriction name
        tag_pattern = r"[-\s]".join(re.escape(word) for word in restriction.split())
        restriction_mask = recipe_df['tags'].str.contains(
            tag_pattern, case=False, na=False, regex=True
        )

        # Special handling for some restrictions
        if restriction == "gluten free":
            # Either must have gluten-free tag or not contain gluten ingredients
            gluten_ingredients = ["wheat", "barley", "rye", "flour", "pasta", "bread", "couscous"]
            contains_gluten = recipe_df['ingredients'].apply(
                lambda x: any(g in str(x).lower() for g in gluten_ingredients)
            )
            restriction_mask = restriction_mask | ~contains_gluten

        elif restriction == "low cholesterol":
            # Either has low-cholesterol tag or doesn't have high cholesterol ingredients
            high_cholesterol = ["egg yolk", "liver", "organ meat", "shrimp", "butter", "cheese"]
            contains_high_cholesterol = recipe_df['ingredients'].apply(
                lambda x: any(h in str(x).lower() for h in high_cholesterol)
            )
            restriction_mask = restriction_mask | ~contains_high_cholesterol

        mask = mask & restriction_mask

    return recipe_df[mask]

def search_recipes(
    query: str,
    embedding_model: SentenceTransformer,
    faiss_index,
    recipe_df: pd.DataFrame,
    available_ingredients: List[str],
    dietary_restrictions: List[str],
    top_k: int = 10,  # Get more initial results for better filtering
    final_k: int = 5   # Final number to return
) -> List[Dict[str, Any]]:
    """
    Search for recipes based on user query, dietary restrictions, and available ingredients.
    Implements both semantic search and ingredient overlap scoring.

    The FAISS index is searched over ALL recipes, and each returned vector id
    is mapped back to the row at the same position in `recipe_df`; hits whose
    row did not pass the dietary filter are discarded. (Previously the ids
    were used as positions inside the filtered DataFrame, which returned
    unrelated recipes whenever the filter removed any row.)

    Assumes vector id i in `faiss_index` corresponds to row position i of
    `recipe_df` (i.e. the index was built from this DataFrame in row order).

    Args:
        query: User's query
        embedding_model: Loaded sentence transformer model
        faiss_index: Loaded FAISS index
        recipe_df: DataFrame with recipe metadata
        available_ingredients: List of available ingredients
        dietary_restrictions: List of dietary restrictions
        top_k: Number of initial recipes to retrieve
        final_k: Number of final recipes to return

    Returns:
        List of recipe dictionaries with match scores and missing ingredients,
        sorted by combined score (best first), at most `final_k` long.
    """
    # Step 1: Pre-filter by tags if dietary restrictions are present
    filtered_df = filter_by_tags(recipe_df, dietary_restrictions)

    # If we have filtered out all recipes, fall back to the original dataframe
    if len(filtered_df) == 0:
        print("Warning: Tag filtering removed all recipes. Falling back to full dataset.")
        filtered_df = recipe_df

    n_rows = len(recipe_df)
    if top_k <= 0 or n_rows == 0:
        return []

    # Boolean array over row positions of recipe_df: True where the row
    # survived the dietary filter.
    allowed = np.asarray(recipe_df.index.isin(filtered_df.index))

    # Step 2: Enhance query with dietary restrictions for better embedding
    enhanced_query = query
    for restriction in dietary_restrictions:
        if restriction not in query.lower():
            enhanced_query += f" {restriction}"

    # Step 3: Create embedding for the query
    query_embedding = embedding_model.encode([enhanced_query])[0].reshape(1, -1).astype('float32')

    # Step 4: Get the top-k allowed matches by semantic similarity.
    # Because filtering happens after the vector search, widen the search
    # until enough allowed hits are found or the whole index has been searched.
    n_vectors = int(getattr(faiss_index, "ntotal", n_rows))
    k = min(top_k, n_vectors)
    hits = []
    while k > 0:
        distances, indices = faiss_index.search(query_embedding, k)
        hits = [
            (dist, int(idx))
            for dist, idx in zip(distances[0], indices[0])
            # FAISS pads missing results with -1; also ignore ids with no row
            if 0 <= idx < n_rows and allowed[idx]
        ]
        if len(hits) >= top_k or k >= n_vectors:
            break
        k = min(k * 4, n_vectors)
    hits = hits[:top_k]

    # An empty string is a substring of everything, so it would make every
    # ingredient look available; ignore blanks for the missing-ingredient check.
    usable_available = [avail_ing for avail_ing in available_ingredients if avail_ing]

    # Step 5: Compute combined scores and identify missing ingredients
    results = []
    for dist, idx in hits:
        recipe = recipe_df.iloc[idx].to_dict()

        # Parse ingredients to check availability
        recipe_ingredients = parse_ingredient_list(recipe['ingredients'])

        # Calculate Jaccard similarity for ingredient overlap
        jaccard_score = calculate_jaccard_similarity(recipe_ingredients, available_ingredients)

        # Check which ingredients are missing: an ingredient counts as
        # available when any available ingredient is a substring of it
        missing_ingredients = [
            ingredient
            for ingredient in recipe_ingredients
            if not any(avail_ing in ingredient for avail_ing in usable_available)
        ]

        # Calculate semantic similarity score (convert distance to similarity)
        # NOTE(review): this is 1 minus the raw index distance, so it can be
        # negative for distant matches - confirm the intended score range.
        semantic_score = float(1 - dist)

        # Combine scores (60% semantic, 40% ingredient overlap)
        combined_score = (semantic_score * 0.6) + (jaccard_score * 0.4)

        results.append({
            'id': recipe['id'],
            'name': recipe['name'],
            'ingredients': recipe_ingredients,
            'missing_ingredients': missing_ingredients,
            'steps': recipe['steps'],
            'tags': recipe['tags'],
            'semantic_score': semantic_score,
            'ingredient_overlap_score': jaccard_score,
            'combined_score': combined_score,
            'serving_size': recipe['serving_size'],
            'servings': recipe['servings']
        })

    # Sort by combined score and return top final_k
    results.sort(key=lambda x: x['combined_score'], reverse=True)
    return results[:final_k]


In [None]:
# ------------- LLM RESPONSE GENERATION -------------

def generate_recipe_fallback(
    query: str,
    dietary_restrictions: List[str],
    available_ingredients: List[str],
    llm_model: Any
) -> Dict[str, Any]:
    """
    Generate a fallback recipe suggestion when no good matches are found.

    The LLM is asked for a JSON recipe. If the completion call fails, or the
    reply cannot be parsed into a JSON object, the error is printed and a
    minimal hand-built recipe is returned instead, so callers always receive a
    dictionary.

    Args:
        query: Original user query
        dietary_restrictions: List of identified dietary restrictions
        available_ingredients: List of available ingredients
        llm_model: Loaded Llama 3.2 model; must provide
            `create_completion(prompt, ...)` returning a mapping with
            `['choices'][0]['text']`

    Returns:
        Dictionary with fallback recipe information. It always carries
        `is_fallback=True` and `combined_score=0.5`.
    """
    # Create prompt for the LLM to generate a recipe
    ingredients_list = ", ".join(available_ingredients[:15])  # Limit to first 15 ingredients
    # Spell out the empty case instead of leaving the prompt field blank
    restrictions_list = ", ".join(dietary_restrictions) or "none"

    prompt = f"""
    Create a simple recipe that meets these requirements:

    Available ingredients: {ingredients_list}
    Dietary restrictions: {restrictions_list}
    User request: {query}

    Return ONLY a JSON object with this exact format:
    {{
      "name": "Recipe name",
      "ingredients": ["ingredient1", "ingredient2", ...],
      "steps": "Step-by-step instructions",
      "tags": "comma,separated,tags",
      "notes": "Any special notes about dietary restrictions"
    }}

    The recipe should use as many of the available ingredients as possible while respecting all dietary restrictions.
    """

    # Generate response with Llama 3.2
    try:
        response = llm_model.create_completion(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            stop=["```", "User:"]
        )

        # Parse JSON response
        response_text = response['choices'][0]['text'].strip()
        # Extract JSON if embedded in text
        if "```json" in response_text:
            json_str = response_text.split("```json")[1].split("```")[0].strip()
        elif "{" in response_text and "}" in response_text:
            # Take everything from the first "{" to the last "}"
            json_str = response_text[response_text.find("{"):response_text.rfind("}")+1]
        else:
            json_str = response_text

        fallback_recipe = json.loads(json_str)
        if not isinstance(fallback_recipe, dict):
            # Valid JSON that is not an object (e.g. a list) is not a usable recipe
            raise ValueError("LLM response is not a JSON object")

        # Add fallback flag
        fallback_recipe["is_fallback"] = True
        fallback_recipe["combined_score"] = 0.5  # Arbitrary middle score

        return fallback_recipe
    except Exception as e:
        # If generation or JSON parsing fails, return a basic fallback
        print(f"Error generating fallback recipe: {str(e)}")
        return {
            # Join the parts so that no restrictions gives "Simple Recipe"
            # rather than a name with a doubled space
            "name": " ".join(["Simple", *dietary_restrictions, "Recipe"]),
            "ingredients": available_ingredients[:5],
            "steps": "Combine available ingredients to taste.",
            "tags": ",".join(dietary_restrictions),
            "notes": "This is a basic recipe using your available ingredients.",
            "is_fallback": True,
            "combined_score": 0.5
        }

def _join_items(items: Any) -> str:
    """Render a recipe field (list, set, plain string or None) as comma-separated text."""
    if not items:
        return ""
    if isinstance(items, str):
        # A plain string must be kept whole; str.join would split it into characters.
        return items
    return ', '.join(str(item) for item in items)


def _format_steps(steps: Any) -> str:
    """Turn recipe steps into display text, numbering multi-step input and skipping blank lines."""
    if not steps:
        return "No steps available."

    if isinstance(steps, str):
        if '\n' not in steps:
            # Single-line instructions are passed through untouched.
            return steps
        raw_lines = steps.split('\n')
    else:
        # Steps supplied as a sequence (e.g. a JSON array) are numbered item by item.
        raw_lines = [str(step) for step in steps]

    cleaned = [line.strip() for line in raw_lines if line.strip()]
    if not cleaned:
        return "No steps available."
    return '\n'.join(f"{number}. {line}" for number, line in enumerate(cleaned, start=1))


def generate_user_friendly_response(
    query: str,
    recipes: List[Dict[str, Any]],
    dietary_restrictions: List[str],
    llm_model: Any
) -> str:
    """
    Generate a user-friendly, conversational response about recipe recommendations.

    Builds a prompt describing either the found recipes or the absence of
    matches, sends it to ``llm_model.create_completion`` and returns the
    stripped completion text. If the completion call raises, the error is
    printed and a canned sentence is returned instead.

    Args:
        query: Original user query
        recipes: List of recipe dictionaries. The keys read here are 'name',
            'ingredients', 'missing_ingredients', 'steps', 'tags' and
            'is_fallback'; all are optional. 'ingredients' and
            'missing_ingredients' may be a list or a plain string.
        dietary_restrictions: List of identified dietary restrictions
        llm_model: Object exposing ``create_completion(prompt, ...)`` that
            returns a dict shaped like ``{'choices': [{'text': ...}]}``

    Returns:
        Natural language response
    """
    restrictions_text = ', '.join(dietary_restrictions) if dietary_restrictions else ""

    # Handle empty results with fallback messaging
    if not recipes:
        prompt = f"""
        User Query: {query}

        I couldn't find any recipes that match your dietary needs ({restrictions_text or "no restrictions"}).

        Please write a friendly, helpful response that:
        1. Acknowledges that no perfect matches were found
        2. Suggests they try a different set of restrictions or ingredients
        3. Offers to help them find alternatives
        4. Maintains a conversational, supportive tone

        Your response:
        """
    else:
        # Check if we're using a fallback (AI generated) recipe
        has_fallback = any(recipe.get('is_fallback', False) for recipe in recipes)

        # Format recipe information
        recipe_info = []
        for position, recipe in enumerate(recipes, start=1):
            missing_text = _join_items(recipe.get('missing_ingredients'))
            if missing_text:
                missing = f"Missing ingredients: {missing_text}"
            else:
                missing = "All ingredients available!"

            ai_marker = '[AI GENERATED RECIPE]' if recipe.get('is_fallback', False) else ''
            ingredients_text = _join_items(recipe.get('ingredients'))
            steps_formatted = _format_steps(recipe.get('steps', ''))

            recipe_info.append(f"""
            Recipe {position}: {recipe.get('name', 'Unnamed Recipe')}
            {ai_marker}

            Ingredients: {ingredients_text}
            {missing}

            Steps: {steps_formatted}

            Tags: {recipe.get('tags', 'No tags')}
            """)

        all_recipes = "\n".join(recipe_info)

        # The sixth instruction is only added when an AI generated recipe is present.
        fallback_note = (
            "6. If recommending the AI-generated recipe, briefly mention that it's a custom suggestion based on their available ingredients"
            if has_fallback else ""
        )

        prompt = f"""
        User Query: "{query}"

        Dietary Restrictions: {restrictions_text or "None specified"}

        Found Recipes:
        {all_recipes}

        Please write a friendly, conversational response that:
        1. Acknowledges the user's request and dietary needs in a natural way
        2. Recommends the best recipe(s) from the list with a brief explanation why
        3. Mentions any missing ingredients they would need to get
        4. Adds a helpful cooking tip relevant to the top recipe
        5. Asks if they'd like more details about any recipe
        {fallback_note}

        Keep the tone warm and helpful, like a knowledgeable friend who loves cooking. Avoid being too formal or robotic.

        Your response:
        """

    # Generate response with Llama 3.2
    try:
        response = llm_model.create_completion(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            stop=["User:", "\n\n\n"]
        )

        return response['choices'][0]['text'].strip()
    except Exception as e:
        # Fallback response if LLM fails
        print(f"Error generating user-friendly response: {str(e)}")

        if not recipes:
            return "I couldn't find any recipes that match your requirements. Would you like to try with different ingredients or dietary preferences?"

        top_recipe = recipes[0]
        recipe_name = top_recipe.get('name', 'this recipe')
        missing_text = _join_items(top_recipe.get('missing_ingredients'))
        if missing_text:
            return f"I recommend trying {recipe_name}. You'll need to get {missing_text} if you don't have them already."
        # Nothing is missing: avoid the broken sentence "You'll need to get  if ...".
        return f"I recommend trying {recipe_name}. You already have all the ingredients you need."


In [None]:
from llama_cpp import Llama
import json

def process_recipe_query(
    query: str,
    llm_model_path: str = "/path/to/your/llama-3.2-8b.gguf",
    recipe_metadata_path: str = "/absolute/path/to/recipe_metadata.csv",
    faiss_index_path: str = "/absolute/path/to/recipe_faiss.index",
    embedding_model_name: str = "all-MiniLM-L6-v2"
) -> str:
    """
    Main function to process a user query about recipes.

    Loads the embedding model, the Llama model, the recipe metadata and the
    FAISS index, searches for matching recipes, optionally asks the LLM for a
    fallback recipe, and finally produces a conversational answer. Every
    stage is best-effort: failures are printed and the function degrades to a
    simpler answer rather than raising.

    Args:
        query: User's natural language query
        llm_model_path: Path to the GGUF model file passed to ``Llama``.
            The default is a placeholder and must be overridden to get
            LLM-generated answers.
        recipe_metadata_path: Path to the recipe metadata CSV
        faiss_index_path: Path to the FAISS index file
        embedding_model_name: SentenceTransformer model name used for search

    Returns:
        Natural language response with recipe recommendations
    """
    try:
        # Load models and data
        embedding_model = SentenceTransformer(embedding_model_name)

        # The LLM is optional: if it cannot be loaded the function still
        # answers with a plain template further down.
        try:
            llm_model = Llama(
                model_path=llm_model_path,
                n_ctx=4096,  # Context window
                n_gpu_layers=-1  # Use all available GPU layers if possible
            )
        except Exception as model_load_error:
            print(f"Error loading LLM model: {model_load_error}")
            llm_model = None

        # Load recipe data and FAISS index; without them nothing can be searched.
        try:
            recipe_df = pd.read_csv(recipe_metadata_path)
            faiss_index = faiss.read_index(faiss_index_path)
        except Exception as data_load_error:
            print(f"Error loading recipe data: {data_load_error}")
            return "Sorry, I'm having trouble loading the recipe database."

        # Get available ingredients from database
        available_ingredients = get_available_ingredients()

        # Extract dietary restrictions from query
        dietary_restrictions = extract_dietary_restrictions(query)

        # Search for matching recipes
        try:
            recipes = search_recipes(
                query=query,
                embedding_model=embedding_model,
                faiss_index=faiss_index,
                recipe_df=recipe_df,
                available_ingredients=available_ingredients,
                dietary_restrictions=dietary_restrictions
            )
        except Exception as search_error:
            print(f"Error in recipe search: {search_error}")
            recipes = []

        # If no good matches are found or all need many ingredients, generate a fallback
        needs_fallback = not recipes or all(
            len(recipe.get('missing_ingredients', [])) > 3 for recipe in recipes
        )
        # Explicit None check: do not rely on the truthiness of the model object.
        if needs_fallback and llm_model is not None:
            try:
                fallback_recipe = generate_recipe_fallback(
                    query=query,
                    dietary_restrictions=dietary_restrictions,
                    available_ingredients=available_ingredients,
                    llm_model=llm_model
                )

                # Add fallback to results only if it is a usable recipe dict
                if isinstance(fallback_recipe, dict) and 'name' in fallback_recipe:
                    recipes.append(fallback_recipe)
            except Exception as fallback_error:
                print(f"Error generating fallback recipe: {fallback_error}")

        # Generate user-friendly response
        try:
            if llm_model is not None:
                response = generate_user_friendly_response(
                    query=query,
                    recipes=recipes,
                    dietary_restrictions=dietary_restrictions,
                    llm_model=llm_model
                )
            elif recipes:
                # Template response without LLM
                top_recipe = recipes[0]
                ingredients = top_recipe.get('ingredients', [])
                if not isinstance(ingredients, str):
                    # Only join real sequences; joining a string would
                    # interleave ", " between its characters.
                    ingredients = ', '.join(ingredients)
                response = f"I recommend {top_recipe.get('name', 'this recipe')}. " \
                           f"You'll need these ingredients: {ingredients}"
            else:
                response = "I couldn't find any recipes matching your request. Could you try a different query?"
        except Exception as response_error:
            print(f"Error generating response: {response_error}")
            response = "I'm having trouble generating a response right now."

        return response

    except Exception as e:
        # Global error handler
        error_msg = f"An unexpected error occurred: {str(e)}"
        print(error_msg)
        return "I'm sorry, I encountered an unexpected error. Please try again."

# ------------- EXAMPLE USAGE -------------

if __name__ == "__main__":
    # Demo entry point: runs only when this code executes as the main module.
    # Sample question that mentions two dietary constraints.
    user_query = "What can I cook if I want low in cholesterol and gluten free?"

    # Run the full pipeline on the sample question and show the returned text.
    response = process_recipe_query(user_query)
    print(response)

An error occurred: name 'Llama' is not defined
I'm sorry, I encountered an error while searching for recipes. Please try again with a different query or check if the recipe database is properly loaded.
