# Create a FAISS DB for Moroccan Recipes and Meals (Kaggle)

In [1]:
import pandas as pd
import numpy as np
from langchain_ollama import OllamaEmbeddings
from tqdm import tqdm
import time
import os
import pickle, faiss

## Read the Moroccan Recipes and Meals from CSV :

In [2]:
# Load the recipes CSV into a DataFrame, then show its (rows, columns) shape
# and a preview of the first rows.
df = pd.read_csv('moroccan_recipes_dataset.csv')
print(df.shape)
df.head()

(4627, 2)


Unnamed: 0,prompt,completion
0,Recipe Name : Duck and prune tagine\n Descript...,['Pour the oil into a heavy-based pan or casse...
1,Recipe Name : Potato tagine with baked halloum...,['Heat the oven to 180°C fan/gas 6. Heat the o...
2,Recipe Name : Merguez sausages with crispy chi...,['Heat the oven to 180°C fan/gas 6. Put the ch...
3,Recipe Name : Chicken tagine casserole\n Descr...,['Heat the oven to 180ºC fan/gas 6. Slash the ...
4,Recipe Name : Preserved lemons\n Description :...,['Juice 4 of the lemons and set the juice asid...


### The `prompt` column contains: recipe name, description, ingredients and steps for preparation

In [3]:
# Show one full prompt (row label 6) to illustrate the text layout
df.loc[6, 'prompt']

"Recipe Name : Clodagh McKenna’s Moroccan spiced lamb chops\n Description : Clodagh McKenna spices up lamb chops with a harissa-yogurt marinade in this quick and easy recipe. Serve with a herby bulgur wheat salad. Recipe from In Minutes by Clodagh McKenna (Kyle Books £20). For a Sunday roast with a twist, try our Moroccan spiced lamb shoulder with freekeh.\n Ingredients : ['4-6 British lamb chops, depending on size', '1 tbsp olive oil']\n Steps for preparation : nan\n "

## Transform Data

- remove rows whose full prompt is duplicated
- remove rows with a duplicated recipe name (keep the first occurrence)

In [4]:
# Drop rows whose full prompt text repeats an earlier row (first occurrence wins)
df = df.drop_duplicates(subset='prompt', keep='first')
df.shape

(4626, 2)

In [5]:
# De-duplicate on recipe name.
# The name lives on the first line of each prompt, after the "Recipe Name :" label.
first_lines = df['prompt'].str.split('\n').str[0]
df['first_line'] = first_lines
df['recipe_name'] = first_lines.str.extract(r'Recipe Name\s*:\s*(.*)', expand=False)

# Keep only the first row for every recipe name, then discard the helper column
df_unique = (
    df.drop_duplicates(subset='recipe_name', keep='first')
      .drop(columns=['first_line'])
)

In [6]:
# (rows, columns) left after de-duplicating on recipe name
df_unique.shape

(3383, 3)

In [7]:
# Number of missing values in each column
df_unique.isna().sum()

prompt         0
completion     8
recipe_name    1
dtype: int64

In [8]:
# Preview the first rows, including the extracted recipe_name column
df_unique.head()

Unnamed: 0,prompt,completion,recipe_name
0,Recipe Name : Duck and prune tagine\n Descript...,['Pour the oil into a heavy-based pan or casse...,Duck and prune tagine
1,Recipe Name : Potato tagine with baked halloum...,['Heat the oven to 180°C fan/gas 6. Heat the o...,Potato tagine with baked halloumi
2,Recipe Name : Merguez sausages with crispy chi...,['Heat the oven to 180°C fan/gas 6. Put the ch...,Merguez sausages with crispy chickpeas and pre...
3,Recipe Name : Chicken tagine casserole\n Descr...,['Heat the oven to 180ºC fan/gas 6. Slash the ...,Chicken tagine casserole
4,Recipe Name : Preserved lemons\n Description :...,['Juice 4 of the lemons and set the juice asid...,Preserved lemons


In [9]:
# Preview the last rows; the final row has no completion and no recipe name
df_unique.tail()

Unnamed: 0,prompt,completion,recipe_name
4579,Recipe Name : Moroccan Chicken Skillet\n Ingre...,"[""Using a meat mallet (or a rolling pin) pound...",Moroccan Chicken Skillet
4580,Recipe Name : Moroccan-Spiced Chicken\n Ingred...,"[""Position rack in center of oven and preheat ...",Moroccan-Spiced Chicken
4581,Recipe Name : Moroccan Chicken and Summer Squa...,"[""Heat grill to high."", ""Stir together cumin, ...",Moroccan Chicken and Summer Squash Salad
4582,Recipe Name : moroccan chicken kabob tapas Rec...,"[""Cut chicken into 1 inch cubes."", ""Put in all...",moroccan chicken kabob tapas Recipe
4583,Top 10 foods to try in Morocco : Harira - Tagi...,,


## Load Data to Faiss DB

In [10]:
# On-disk locations of the FAISS index and of the pickled list of indexed texts
db_path = 'Moroccan_Recipes_FaissDB'
index_path = os.path.join(db_path, "moroccan_recipes.index")
meta_path = os.path.join(db_path, "moroccan_recipes.pkl")
os.makedirs(db_path, exist_ok=True)

# Must equal the embedding length: nomic-embed-text returns 768-dimensional
# vectors (see the length check on the embedding model further down).
EMBEDDING_DIM = 768

if os.path.exists(index_path):
    print("🔄 Loading FAISS index...")
    index = faiss.read_index(index_path)
else:
    print("⚙️ Creating new FAISS index...")
    index = faiss.IndexFlatL2(EMBEDDING_DIM)

if os.path.exists(meta_path):
    print("📦 Loading metadata...")
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook wrote itself.
    with open(meta_path, 'rb') as f:
        collection = pickle.load(f)
else:
    collection = []

# Fail early on a store that cannot be used consistently: a dimension mismatch
# makes every index.add fail, and a count mismatch breaks the position-based
# mapping between index vectors and the texts in `collection`.
if index.d != EMBEDDING_DIM:
    raise ValueError(
        f"FAISS index at {index_path} has dimension {index.d}, expected {EMBEDDING_DIM}"
    )
if index.ntotal != len(collection):
    raise ValueError(
        f"FAISS index holds {index.ntotal} vectors but metadata holds {len(collection)} texts"
    )

⚙️ Creating new FAISS index...


In [7]:
# Embedding client backed by the "nomic-embed-text:latest" Ollama model
embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")

In [8]:
def generate_embeddings(text: str):
    """Return the embedding vector that ``embedding_model`` produces for ``text``."""
    vector = embedding_model.embed_query(text)
    return vector

In [9]:
# Sanity check: length of one embedding vector. The FAISS index must be created
# with this same dimension.
print(len(generate_embeddings('qwertyu yui ytuyg uy ')))

768


In [10]:
# Peek at the first 20 components of a sample embedding
print(generate_embeddings('qwertyu yui ytuyg uy ')[:20])

[-0.019315952, 0.02758621, -0.15841325, -0.056062303, 0.0438976, -0.015040412, 0.020199563, 0.001384781, -0.0090258345, -0.019812193, -0.017219989, 0.06601585, 0.041778587, 0.04475724, 0.008813767, 0.0033704017, 0.040916603, -0.05513884, -0.00344933, 0.040923666]


In [13]:
def load_to_faiss(row_text: str):
    """Embed ``row_text`` and append it to the FAISS index and the text store.

    The vector is added to the module-level ``index`` and the raw text is
    appended to the module-level ``collection`` in the same call, so both grow
    in the same order.

    Args:
        row_text: The text to embed and store.
    """
    embedding = generate_embeddings(row_text)

    # FAISS works on 2-D float32 matrices of shape (n_vectors, dim); NumPy would
    # otherwise build a float64 array from a list of Python floats.
    vector = np.asarray([embedding], dtype=np.float32)
    index.add(vector)
    collection.append(row_text)

def save_faiss():
    """Write the FAISS index to ``index_path`` and pickle ``collection`` to ``meta_path``."""
    faiss.write_index(index, index_path)
    with open(meta_path, 'wb') as meta_file:
        pickle.dump(collection, meta_file)

In [14]:
# Embedding is slow (about 2.7 s per row, ~2.5 h in total), so persist progress
# periodically instead of only once at the very end.
SAVE_EVERY = 250

# Texts already in the store (non-empty when the index/metadata were loaded from
# disk); skipping them keeps a re-run of this cell from adding duplicate vectors.
already_indexed = set(collection)

recipe_rows = zip(df_unique['prompt'], df_unique['recipe_name'])
for n_seen, (prompt, recipe_name) in enumerate(tqdm(recipe_rows, total=len(df_unique)), start=1):
    # One row has no recipe name (NaN); do not append a literal "nan" to its text
    if pd.isna(recipe_name):
        row_text = prompt
    else:
        row_text = f"{prompt} \n\n-> Recipe Name : {recipe_name}"

    if row_text in already_indexed:
        continue

    load_to_faiss(row_text)
    already_indexed.add(row_text)

    if n_seen % SAVE_EVERY == 0:
        save_faiss()

save_faiss()

100%|████████████████████████████████████████████████████████████████████████████| 3383/3383 [2:30:00<00:00,  2.66s/it]
