In [None]:
# Import libraries
import os
from tqdm import tqdm
import torch
import chromadb
from chromadb.config import Settings
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Absolute path of the "chroma_db" directory under the current working
# directory. Bound to a name so it can be reused, and left as the cell's last
# expression so the notebook still displays it.
CHROMA_DB_DIR = os.path.join(os.getcwd(), "chroma_db")
CHROMA_DB_DIR

In [None]:
# Load the original CSV file
df = pd.read_csv('recipes_ingredients.csv')
# Preview the first rows. A bare last expression uses the notebook's rich
# table display, whereas print() flattens the frame to plain text.
df.head()

In [None]:
# define function to generate single descriptive text for each recipe
def build_recipe_text(row):
    """Build the single descriptive text string for one recipe.

    Args:
        row: Mapping-like record (for example a DataFrame row) that can be
            indexed with the keys 'title', 'description' and 'ingredients'.

    Returns:
        str: " Name: <title> | Description: <description> | Ingredients: <ingredients>".
        Missing values (None, NaN, pd.NA) are rendered as empty text instead
        of the literal words "None", "nan" or "<NA>".
    """
    def _text_or_blank(value):
        # Map missing-value markers to an empty string; pass everything else through.
        if value is None or value is pd.NA:
            return ""
        # NaN is the only float that does not compare equal to itself.
        if isinstance(value, float) and value != value:
            return ""
        return value

    title = _text_or_blank(row['title'])
    description = _text_or_blank(row['description'])
    ingredients = _text_or_blank(row['ingredients'])

    # The leading space is kept so the text for complete rows is unchanged.
    return f" Name: {title} | Description: {description} | Ingredients: {ingredients}"

In [None]:
# Build the descriptive text for every row, then store it as a new column
recipe_text_column = df.apply(build_recipe_text, axis=1)
df['recipe_text'] = recipe_text_column

In [None]:
# Load the embedding model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# Initialize Chroma
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="my_collection")

# Texts to embed and the ids they are stored under. They are defined in this
# cell so it runs on a fresh kernel (Restart & Run All) instead of relying on
# variables left over from an earlier session.
# NOTE(review): ids are the stringified DataFrame index. This assumes an
# integer index, because query_chroma_collection converts ids back with int().
# Confirm the index is the intended recipe identifier.
recipe_texts = df['recipe_text'].tolist()
ids = [str(row_label) for row_label in df.index]

batch_size = 1024

# Embed and upload the recipes one batch at a time
for i in tqdm(range(0, len(recipe_texts), batch_size), desc="Embedding and uploading"):
    batch_texts = recipe_texts[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]

    # Encode recipe_text on the device chosen above. A hard-coded "cuda" here
    # would fail on CPU-only machines despite the fallback.
    batch_embeddings = model.encode(batch_texts, batch_size=batch_size, device=str(device), show_progress_bar=False)

    # Upload the texts and their embeddings to Chroma
    collection.upsert(
        documents=batch_texts,
        embeddings=batch_embeddings,
        ids=batch_ids
    )

In [None]:
# define function to query the chroma collection for testing
def query_chroma_collection(
    query_texts,
    collection_name="my_collection",
    chroma_path="chroma_db",
    n_results=20
):
    """Query a persistent Chroma collection and return the matching ids.

    Args:
        query_texts: Query text(s) passed through to ``collection.query``.
        collection_name: Name of the collection to open; it is created if it
            does not exist yet.
        chroma_path: Directory of the persistent Chroma store.
        n_results: Number of results requested per query.

    Returns:
        list[int]: Ids returned for the FIRST query only, each converted with
        ``int()``. Results for any further queries are not returned.
    """
    # Open the on-disk store and fetch (or create) the requested collection
    client = chromadb.PersistentClient(path=chroma_path)
    target_collection = client.get_or_create_collection(name=collection_name)

    # Run the search
    response = target_collection.query(query_texts=query_texts, n_results=n_results)

    # Keep only the first query's ids and convert them to integers
    first_query_ids = response['ids'][0]
    return list(map(int, first_query_ids))


In [None]:
# Example: query the Chroma collection and view the returned ids
example_query = ["Suggest me a dish with pizza and macroni"]
results = query_chroma_collection(
    example_query,
    collection_name="my_collection",
    chroma_path="chroma_db",
    n_results=25,
)

print(results)