<a href="https://colab.research.google.com/github/empresario-ai-tech/ai-experiments/blob/dec-1-explore-embeddings/Embeddings/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive at /content/drive so later cells can read the
# requirements file and the ingredient data stored under MyDrive.
# NOTE(review): the google.colab import is presumably only available on Colab
# runtimes — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the dependencies listed in the requirements.txt kept on the mounted Drive.
# The path is under /content/drive, so the Drive mount cell must have run first.
!pip install -r '/content/drive/MyDrive/Colab Notebooks/Embeddings/requirements.txt'

In [None]:
# Install sentence-transformers, upgrading (-U) to the newest available release;
# it provides the SentenceTransformer class imported further down.
# NOTE(review): the version is unpinned, so reruns may pick up a different release.
!pip install -U sentence-transformers

In [None]:
# Remove any installed tensorflow package without prompting (-y);
# a pinned tensorflow-cpu build is installed in a later cell.
!pip uninstall -y tensorflow

In [None]:
# Pin NumPy to exactly 1.21.6.
# NOTE(review): the reason for this pin is not recorded in the notebook —
# presumably compatibility with the other pinned packages; confirm before changing it.
!pip install numpy==1.21.6

In [None]:
# Install matplotlib, thinc and gensim at whatever versions pip resolves (unpinned).
# NOTE(review): none of these are imported by the cells visible here — confirm
# they are still needed.
!pip install matplotlib thinc gensim

In [None]:
# Install the tensorflow-cpu distribution, pinned to 2.11.0, after the plain
# tensorflow package was uninstalled in an earlier cell.
!pip install tensorflow-cpu==2.11.0

In [None]:
# Upgrade the transformers package to the newest available release (unpinned).
!pip install transformers --upgrade

In [None]:
# Remove any installed torch, torchvision, torchaudio and torch_xla packages
# without prompting (-y); the next cell reinstalls all four at pinned versions.
!pip uninstall -y torch torchvision torchaudio torch_xla

In [None]:
# Install PyTorch, TorchVision, and TorchAudio at pinned versions
# (2.0.1 / 0.15.2 / 2.0.2); -f gives pip the PyTorch wheel listing as an
# additional place to look for packages.
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -f https://download.pytorch.org/whl/torch_stable.html

# Install torch_xla 2.0.1, pointing -f at a torch_xla 2.0 wheel URL.
# NOTE(review): that wheel is tagged cp310 / linux_x86_64, so this presumably
# assumes a Python 3.10 Linux runtime — confirm. The import cell treats
# torch_xla as optional (it falls back to xm = None on ImportError).
!pip install torch_xla==2.0.1 -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp310-cp310-linux_x86_64.whl

In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss
import pickle
import torch
try:
    import torch_xla.core.xla_model as xm
except ImportError:
    xm = None
import os
from tqdm import tqdm

# Configuration
# ---- Paths ---------------------------------------------------------------
# Folder on the mounted Drive that holds the notebook's data files.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Embeddings"
# Pickle file used to cache the ingredient table, embeddings and FAISS index.
EMBEDDINGS_PATH = '/content/ingredient_embeddings.pkl'
# CSV of standardized ingredients; read via its 'ingredient_name' column.
STANDARDIZED_INGREDIENTS_FILE = DATA_PATH + '/standardized_ingredients_dummy.csv'

# ---- Model / matching parameters -----------------------------------------
MODEL_NAME = 'all-MiniLM-L6-v2'  # identifier handed to SentenceTransformer
BATCH_SIZE = 1000                # CSV rows encoded per chunk
THRESHOLD = 0.7                  # minimum similarity for a match to be accepted

# ---- Sample inputs -------------------------------------------------------
# First list passed to standardize_ingredients (the "user" side).
USER_INGREDIENTS = [
    "all purpose flour", "gran sugar", "olive oil", "black pepper",
    "chicken breasts", "eggs", "milk", "butter",
]

# Second list passed to standardize_ingredients (the "AI-generated" side).
AI_INGREDIENTS = [
    "all-purpose flour", "granulated sugar", "extra virgin olive oil", "black pepper",
    "chicken breast", "egg", "whole milk", "unsalted butter",
]


In [5]:
# Define IngredientEmbeddings class
class IngredientEmbeddings:
    """Embeds the standardized ingredient names and indexes them with FAISS.

    The instance keeps three pieces of state that must stay row-aligned:

    * ``standardized_ingredients`` - DataFrame read from
      ``STANDARDIZED_INGREDIENTS_FILE`` (must have an ``ingredient_name`` column),
    * ``embeddings`` - 2-D array with one embedding row per ingredient row,
    * ``index`` - ``faiss.IndexFlatL2`` built over ``embeddings``.

    All three start as ``None`` and are filled either by
    ``initialize_embeddings`` (compute from the CSV) or ``load_embeddings``
    (restore from a pickle written by ``save_embeddings``).
    """

    def __init__(self, model_name=MODEL_NAME, batch_size=BATCH_SIZE):
        """Load the sentence-embedding model onto the selected device.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.
            batch_size: Number of ingredient rows encoded per ``encode`` call
                in ``generate_embeddings``.
        """
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name).to(self.device)
        self.batch_size = batch_size
        self.standardized_ingredients = None
        self.embeddings = None
        self.index = None

    def get_device(self):
        """Return ``'cuda'`` when torch reports a CUDA device, else ``'cpu'``."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def load_standardized_ingredients(self):
        """Read ``STANDARDIZED_INGREDIENTS_FILE`` and return it as a DataFrame."""
        return pd.read_csv(STANDARDIZED_INGREDIENTS_FILE)

    def generate_embeddings(self):
        """Encode every ``ingredient_name`` and return the stacked embeddings.

        Uses ``self.standardized_ingredients`` when it is already loaded, so the
        embedding rows are guaranteed to line up with that table; otherwise the
        CSV is read here. Names are encoded ``self.batch_size`` rows at a time.

        Returns:
            A C-contiguous float32 array with one row per ingredient (the
            layout FAISS expects).

        Raises:
            ValueError: If the ingredient table contains no rows.
        """
        ingredients = self.standardized_ingredients
        if ingredients is None:
            ingredients = self.load_standardized_ingredients()

        names = ingredients['ingredient_name'].tolist()
        if not names:
            # Without this guard np.vstack([]) fails with an opaque message.
            raise ValueError(
                f"No ingredients found in {STANDARDIZED_INGREDIENTS_FILE}; cannot generate embeddings."
            )

        all_embeddings = []
        batch_starts = range(0, len(names), self.batch_size)
        for start in tqdm(batch_starts, desc="Generating Embeddings"):
            chunk_embeddings = self.model.encode(
                names[start:start + self.batch_size],
                convert_to_tensor=True,
                show_progress_bar=False,
                device=self.device
            )
            # Move each batch off the accelerator right away to keep device memory flat.
            all_embeddings.append(chunk_embeddings.cpu().numpy())
        return np.ascontiguousarray(np.vstack(all_embeddings), dtype=np.float32)

    def build_faiss_index(self):
        """Build and return a ``faiss.IndexFlatL2`` holding ``self.embeddings``.

        The index dimension is taken from the second axis of the embedding
        array; the vectors are passed to FAISS as contiguous float32.
        """
        vectors = np.ascontiguousarray(self.embeddings, dtype=np.float32)
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        return index

    def save_embeddings(self, path=EMBEDDINGS_PATH):
        """Pickle the ingredient table, embeddings and index to ``path``.

        The table that is saved is the one already held on the instance (the
        one the embeddings were generated from). The CSV is only re-read when
        no table has been loaded yet, so the cached rows cannot drift from the
        cached vectors if the CSV changes on disk in between.

        Args:
            path: Destination file; it is overwritten if it exists.
        """
        ingredient_data = self.standardized_ingredients
        if ingredient_data is None:
            ingredient_data = self.load_standardized_ingredients()

        # NOTE(review): the FAISS index object is pickled directly; confirm the
        # installed faiss build supports pickling index objects.
        payload = {
            'ingredients': ingredient_data,
            'embeddings': self.embeddings,
            'index': self.index
        }
        with open(path, 'wb') as f:
            pickle.dump(payload, f)

    def load_embeddings(self, path=EMBEDDINGS_PATH):
        """Restore the ingredient table, embeddings and index from ``path``.

        Expects the dictionary layout written by ``save_embeddings``
        (keys ``'ingredients'``, ``'embeddings'``, ``'index'``).

        Args:
            path: Pickle file to read.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files this notebook wrote itself; never a file from an untrusted source.
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.standardized_ingredients = data['ingredients']
        self.embeddings = data['embeddings']
        self.index = data['index']

    def initialize_embeddings(self):
        """Load the ingredient table, embed it, and build the FAISS index.

        The table is loaded first so that the embeddings are generated from the
        exact rows that are later used to look up matched names.
        """
        self.standardized_ingredients = self.load_standardized_ingredients()
        self.embeddings = self.generate_embeddings()
        self.index = self.build_faiss_index()

# Define IngredientMatcher class
class IngredientMatcher:
    """Maps free-form ingredient names to the nearest standardized ingredient.

    A query name is embedded with the same model as the catalogue, its nearest
    neighbour is looked up in the FAISS index held by ``IngredientEmbeddings``,
    and the neighbour is accepted only if its similarity reaches ``threshold``.
    """

    def __init__(self, threshold=THRESHOLD):
        """Create the matcher and make sure catalogue embeddings are available.

        Args:
            threshold: Minimum similarity (see ``match_ingredient``) a nearest
                neighbour needs in order to count as a match.
        """
        self.threshold = threshold
        self.device = self.get_device()
        self.embeddings = IngredientEmbeddings()

        # Reuse the cached embeddings when the cache file exists; otherwise
        # compute them from the CSV and write the cache for the next run.
        # NOTE(review): the cache is never invalidated here — if the ingredient
        # CSV changes, the file at EMBEDDINGS_PATH has to be deleted by hand.
        if os.path.exists(EMBEDDINGS_PATH):
            self.embeddings.load_embeddings(EMBEDDINGS_PATH)
        else:
            self.embeddings.initialize_embeddings()
            self.embeddings.save_embeddings(EMBEDDINGS_PATH)

    def get_device(self):
        """Return ``'cuda'`` when torch reports a CUDA device, else ``'cpu'``."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def get_embedding(self, ingredient):
        """Embed a single ingredient name.

        Args:
            ingredient: The ingredient text to encode.

        Returns:
            A NumPy array holding the embedding of the one-element batch
            ``[ingredient]``, moved to the CPU.
        """
        model = self.embeddings.model.to(self.device)
        return model.encode(
            [ingredient],
            convert_to_tensor=True,
            device=self.device
        ).cpu().numpy()

    def match_ingredient(self, ingredient):
        """Find the closest standardized ingredient for ``ingredient``.

        The similarity score is ``1 / (1 + d)``, where ``d`` is the distance
        reported by the FAISS index for the nearest neighbour (for
        ``IndexFlatL2`` FAISS reports squared Euclidean distance). The score is
        1.0 for an exact embedding match and approaches 0 as ``d`` grows.

        Args:
            ingredient: The ingredient text to match.

        Returns:
            ``(matched_name, similarity)`` when the nearest neighbour exists and
            its similarity is at least ``self.threshold``; otherwise
            ``(None, similarity)``.
        """
        query_embedding = self.get_embedding(ingredient)
        distances, indices = self.embeddings.index.search(query_embedding, 1)
        closest_distance = distances[0][0]
        closest_index = indices[0][0]
        similarity = 1 / (1 + closest_distance)

        # FAISS labels a missing neighbour (e.g. an empty index) with -1.
        # Without this check iloc[-1] would silently return the LAST catalogue
        # row whenever the threshold is permissive enough.
        has_neighbour = closest_index >= 0
        if has_neighbour and similarity >= self.threshold:
            matched_name = self.embeddings.standardized_ingredients.iloc[closest_index]['ingredient_name']
            return matched_name, similarity
        return None, similarity

    def match_ingredients_list(self, ingredients):
        """Match every name in ``ingredients``.

        Args:
            ingredients: Iterable of ingredient texts.

        Returns:
            Dict keyed by the input text; each value is a dict with
            ``'matched_name'`` (standardized name or ``None``) and
            ``'similarity'``. Repeated input texts share a single entry.
        """
        matched = {}
        for ingredient in ingredients:
            match, score = self.match_ingredient(ingredient)
            matched[ingredient] = {
                'matched_name': match,
                'similarity': score
            }
        return matched

# Main logic to standardize ingredients
def standardize_ingredients(user_ingredients, ai_ingredients, matcher=None):
    """Match two ingredient lists against the standardized catalogue.

    Args:
        user_ingredients: Iterable of ingredient names from the user side.
        ai_ingredients: Iterable of ingredient names from the AI-generated side.
        matcher: Optional object exposing ``match_ingredients_list(ingredients)``.
            Pass an existing ``IngredientMatcher`` to avoid reloading the model
            and embeddings on every call. When omitted, a new
            ``IngredientMatcher(threshold=THRESHOLD)`` is created, exactly as
            before this parameter existed.

    Returns:
        ``(matched_user, matched_ai)`` - the two dictionaries returned by
        ``matcher.match_ingredients_list`` for the respective input lists.
    """
    if matcher is None:
        matcher = IngredientMatcher(threshold=THRESHOLD)

    print("Matching User Ingredients...")
    matched_user = matcher.match_ingredients_list(user_ingredients)

    print("Matching AI-Generated Ingredients...")
    matched_ai = matcher.match_ingredients_list(ai_ingredients)

    return matched_user, matched_ai

In [None]:
# Execute the main logic
matched_user, matched_ai = standardize_ingredients(USER_INGREDIENTS, AI_INGREDIENTS)

# Both result sets are reported with the same layout: a heading line, then one
# "input -> matched name (Similarity: x.xx)" line per ingredient.
for heading, matches in (
    ("\nMatched User Ingredients:", matched_user),
    ("\nMatched AI-Generated Ingredients:", matched_ai),
):
    print(heading)
    for ingredient, result in matches.items():
        print(f"{ingredient} -> {result['matched_name']} (Similarity: {result['similarity']:.2f})")