<a href="https://colab.research.google.com/github/empresario-ai-tech/ai-experiments/blob/dec-1-explore-embeddings/Embeddings/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

In [None]:
import os

# Resolve the project root (one level above the directory holding this file)
# and the data directory beneath it.
try:
    _HERE = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined when this code runs as a notebook cell or in a
    # REPL. Fall back to the current working directory as "this file's
    # directory" so the same parent/data layout is used.
    # NOTE(review): assumes the notebook is launched from its own folder — confirm.
    _HERE = os.getcwd()

BASE_DIR = os.path.dirname(_HERE)
DATA_PATH = os.path.join(BASE_DIR, 'data')

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss
import pickle
# from .utils import DATA_PATH

class IngredientEmbeddings:
    """Sentence embeddings and a FAISS index over standardized ingredient names.

    Constructing an instance loads the sentence-transformer model, reads the
    ingredient names from ``standardized_ingredients.csv`` under ``DATA_PATH``,
    encodes them, and builds a flat L2 FAISS index over the vectors.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the model and eagerly build names, embeddings and index.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.standardized_ingredients = self.load_standardized_ingredients()
        self.embeddings = self.generate_embeddings()
        self.index = self.build_faiss_index()

    def load_standardized_ingredients(self):
        """Return the ``ingredient_name`` column of the ingredients CSV as a list."""
        df = pd.read_csv(f"{DATA_PATH}/standardized_ingredients.csv")
        return df['ingredient_name'].tolist()

    def generate_embeddings(self):
        """Encode every standardized ingredient name.

        Returns:
            The model output moved to CPU and converted to a NumPy array.
            ``build_faiss_index`` reads ``shape[1]`` from it, so it is expected
            to be 2-D (one row per ingredient).
        """
        embeddings = self.model.encode(
            self.standardized_ingredients,
            convert_to_tensor=True,
            show_progress_bar=True,
        )
        return embeddings.cpu().numpy()

    def build_faiss_index(self):
        """Build a ``faiss.IndexFlatL2`` holding all of ``self.embeddings``."""
        dimension = self.embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(self.embeddings)
        return index

    def save_embeddings(self, path='data/ingredient_embeddings.pkl'):
        """Pickle the ingredient names, embeddings and index to ``path``.

        Missing parent directories are created first, so saving into a fresh
        checkout does not fail with ``FileNotFoundError``.

        Args:
            path: Destination file; the default is relative to the current
                working directory.
        """
        import os  # local import keeps this class usable on its own

        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # NOTE(review): pickling self.index relies on the installed faiss build
        # supporting pickle for index objects — confirm for the pinned version.
        with open(path, 'wb') as f:
            pickle.dump({
                'ingredients': self.standardized_ingredients,
                'embeddings': self.embeddings,
                'index': self.index
            }, f)

    def load_embeddings(self, path='data/ingredient_embeddings.pkl'):
        """Replace names, embeddings and index with the contents of ``path``.

        Args:
            path: File previously written by ``save_embeddings``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load cache files this code wrote itself; never a file
        # obtained from an untrusted source.
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.standardized_ingredients = data['ingredients']
        self.embeddings = data['embeddings']
        self.index = data['index']

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

class IngredientMatcher:
    """Map free-form ingredient names to their nearest standardized name."""

    def __init__(self, threshold=0.7):
        """Build the embedding store and make sure an on-disk cache exists.

        Args:
            threshold: Minimum similarity (see ``match_ingredient``) for a
                nearest neighbour to count as a match.
        """
        self.threshold = threshold
        self.embeddings = IngredientEmbeddings()
        try:
            self.embeddings.load_embeddings()
        except FileNotFoundError:
            # No cache yet. The IngredientEmbeddings constructor has already
            # encoded the ingredients and built the index, so just persist
            # that state instead of encoding everything a second time.
            self.embeddings.save_embeddings()

    def get_embedding(self, ingredient):
        """Encode one ingredient string; returns the model output for a
        one-element batch, moved to CPU and converted to a NumPy array."""
        model = self.embeddings.model
        return model.encode([ingredient], convert_to_tensor=True).cpu().numpy()

    def match_ingredient(self, ingredient):
        """Find the closest standardized ingredient for ``ingredient``.

        Args:
            ingredient: Free-form ingredient name.

        Returns:
            ``(matched_name, similarity)``. ``matched_name`` is ``None`` when
            the similarity is below ``self.threshold`` or the index has no
            neighbour to offer; ``similarity`` is a plain Python float.
        """
        query_embedding = self.get_embedding(ingredient)
        distances, indices = self.embeddings.index.search(query_embedding, 1)

        closest_index = int(indices[0][0])
        if closest_index < 0:
            # FAISS reports -1 when it has no neighbour (e.g. empty index);
            # indexing the name list with -1 would silently pick the last name.
            return None, 0.0

        # IndexFlatL2 reports *squared* L2 distance; map it into (0, 1] so that
        # identical vectors score 1.0 and the score decays with distance.
        closest_distance = float(distances[0][0])
        similarity = 1.0 / (1.0 + closest_distance)

        if similarity >= self.threshold:
            return self.embeddings.standardized_ingredients[closest_index], similarity
        return None, similarity

    def match_ingredients_list(self, ingredients):
        """Match every name in ``ingredients``.

        Returns:
            Dict keyed by the input name, each value a dict with
            ``'matched_name'`` and ``'similarity'``. Duplicate input names
            collapse into a single entry.
        """
        matched = {}
        for ingredient in ingredients:
            match, score = self.match_ingredient(ingredient)
            matched[ingredient] = {
                'matched_name': match,
                'similarity': score
            }
        return matched

In [None]:
# from src.matcher import IngredientMatcher

def standardize_ingredients(user_ingredients, ai_ingredients, threshold=0.7):
    """Match two ingredient lists against the standardized names.

    A single ``IngredientMatcher`` is built and reused for both lists.

    Args:
        user_ingredients: Iterable of user-supplied ingredient names.
        ai_ingredients: Iterable of AI-generated ingredient names.
        threshold: Minimum similarity passed to ``IngredientMatcher``;
            defaults to the previously hard-coded 0.7.

    Returns:
        ``(matched_user, matched_ai)`` — the ``match_ingredients_list`` result
        for each input list, in that order.
    """
    matcher = IngredientMatcher(threshold=threshold)

    print("Matching User Ingredients...")
    matched_user = matcher.match_ingredients_list(user_ingredients)

    print("Matching AI-Generated Ingredients...")
    matched_ai = matcher.match_ingredients_list(ai_ingredients)

    return matched_user, matched_ai


# Demo: ingredient names as a user might enter them.
user_ingredients = [
    "all purpose flour", "gran sugar", "olive oil", "black pepper",
    "chicken breasts", "eggs", "milk", "butter",
]

# Demo: the same ingredients in the wording an AI model produced.
ai_ingredients = [
    "all-purpose flour", "granulated sugar", "extra virgin olive oil", "black pepper",
    "chicken breast", "egg", "whole milk", "unsalted butter",
]

matched_user, matched_ai = standardize_ingredients(user_ingredients, ai_ingredients)

# Print both reports with one loop; the heading is the only thing that differs.
for heading, results in (
    ("\nMatched User Ingredients:", matched_user),
    ("\nMatched AI-Generated Ingredients:", matched_ai),
):
    print(heading)
    for raw_name, info in results.items():
        print(f"{raw_name} -> {info['matched_name']} (Similarity: {info['similarity']:.2f})")