In [27]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os

In [15]:
df = pd.read_csv("../data/raw/recipes.csv")
df = df.set_index('RecipeId')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 522517 entries, 38 to 541383
Data columns (total 27 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Name                        522517 non-null  object 
 1   AuthorId                    522517 non-null  int64  
 2   AuthorName                  522517 non-null  object 
 3   CookTime                    439972 non-null  object 
 4   PrepTime                    522517 non-null  object 
 5   TotalTime                   522517 non-null  object 
 6   DatePublished               522517 non-null  object 
 7   Description                 522512 non-null  object 
 8   Images                      522516 non-null  object 
 9   RecipeCategory              521766 non-null  object 
 10  Keywords                    505280 non-null  object 
 11  RecipeIngredientQuantities  522514 non-null  object 
 12  RecipeIngredientParts       522517 non-null  object 
 13  AggregatedRating  

In [19]:
df.describe

<bound method NDFrame.describe of                                                       Name    AuthorId  \
RecipeId                                                                 
38                       Low-Fat Berry Blue Frozen Dessert        1533   
39                                                 Biryani        1567   
40                                           Best Lemonade        1566   
41                          Carina's Tofu-Vegetable Kebabs        1586   
42                                            Cabbage Soup        1538   
...                                                    ...         ...   
541379                      Meg's Fresh Ginger Gingerbread  2002090414   
541380    Roast Prime Rib au Poivre with Mixed Peppercorns      211566   
541381                               Kirshwasser Ice Cream  2001131545   
541382            Quick & Easy Asian Cucumber Salmon Rolls  2001004241   
541383                             Spicy Baked Scotch Eggs      188099   

   

In [20]:
df.isna().sum()

Name                               0
AuthorId                           0
AuthorName                         0
CookTime                       82545
PrepTime                           0
TotalTime                          0
DatePublished                      0
Description                        5
Images                             1
RecipeCategory                   751
Keywords                       17237
RecipeIngredientQuantities         3
RecipeIngredientParts              0
AggregatedRating              253223
ReviewCount                   247489
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182911
RecipeYield                   348071
RecipeInstructions                 0
d

In [22]:
df = df.dropna(subset=["Name", "RecipeIngredientParts", "RecipeInstructions"])


In [21]:
def clean_list_column(col):
    return (
        col.astype(str)
        .str.replace("c\\(", "", regex=True)
        .str.replace("\\)", "", regex=True)
        .str.replace('"', "", regex=False)
        .str.replace(",", " ", regex=False)
        .str.replace("'", "", regex=False)
        .str.lower()
    )

df["ingredients"] = clean_list_column(df["RecipeIngredientParts"])

In [23]:
df["steps"] = clean_list_column(df["RecipeInstructions"])
df["name"] = df["Name"].fillna("").str.lower()

In [24]:
# Combine fields for RAG
df["combined"] = df["name"] + " " + df["ingredients"] + " " + df["steps"]


In [25]:
# Select final columns
df_clean = df[["name", "ingredients", "steps", "combined"]].copy()


In [29]:
# Save cleaned dataset
CLEANED_PATH = "data/processed/recipes_cleaned.csv"
os.makedirs("data/processed", exist_ok=True)
df_clean.to_csv(CLEANED_PATH, index=False)

print("✅ Cleaned dataset saved to:", CLEANED_PATH)

✅ Cleaned dataset saved to: data/processed/recipes_cleaned.csv


In [None]:
# Step 3: Embed & index
print("🧠 Embedding...")
index, model, embeddings = build_index(df_clean)

In [None]:
# Step 4: Save artifacts
print("💾 Saving index and data...")
save_index(index, "models/faiss.index")
save_embeddings(embeddings, "models/embeddings.npy")
save_dataframe(df_clean, "data/processed/recipes_cleaned.csv")

print("✅ All done! You're ready to query FlavorBot.")