<a href="https://colab.research.google.com/github/empresario-ai-tech/ai-experiments/blob/dec-1-explore-embeddings/Embeddings/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -r '/content/drive/MyDrive/Colab Notebooks/Embeddings/requirements.txt'

In [6]:
# import os

# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# DATA_PATH = os.path.join(BASE_DIR, 'data')

DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Embeddings"

In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.2.2
    Uninstalling sentence-transformers-2.2.2:
      Successfully uninstalled sentence-transformers-2.2.2
Successfully installed sentence-transformers-3.3.1


In [4]:
!pip uninstall -y tensorflow

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [6]:
!pip install numpy==1.21.6

Collecting numpy==1.21.6
  Downloading numpy-1.21.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Downloading numpy-1.21.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.0.9 requires tensorflow>=2.2.0, which is not installed.
tensorflow-text 2.15.0 requires tensorflow<2.16,>=2.15.0; platform_machine != "arm64" or platform_system != "Darwin", which is not installed.
chex 0.1.87 requires numpy>=1.24.1, but you have numpy 1.21.6 which is inco

In [3]:
!pip install matplotlib thinc gensim



In [2]:
!pip install tensorflow-cpu



In [1]:
!pip install tensorflow-cpu==2.11.0



In [2]:
!pip install transformers --upgrade



In [31]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss
import pickle
import torch
import torch_xla.core.xla_model as xm
import os
from tqdm import tqdm

# Configuration
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Embeddings"
EMBEDDINGS_PATH = '/content/ingredient_embeddings.pkl'
STANDARDIZED_INGREDIENTS_FILE = f'{DATA_PATH}/standardized_ingredients_dummy.csv'
MODEL_NAME = 'all-MiniLM-L6-v2'
BATCH_SIZE = 1000
THRESHOLD = 0.7

USER_INGREDIENTS = [
    "all purpose flour",
    "gran sugar",
    "olive oil",
    "black pepper",
    "chicken breasts",
    "eggs",
    "milk",
    "butter"
]

AI_INGREDIENTS = [
    "all-purpose flour",
    "granulated sugar",
    "extra virgin olive oil",
    "black pepper",
    "chicken breast",
    "egg",
    "whole milk",
    "unsalted butter"
]


In [None]:
# Define IngredientEmbeddings class
class IngredientEmbeddings:
    def __init__(self, model_name=MODEL_NAME, batch_size=BATCH_SIZE):
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name).to(self.device)
        self.batch_size = batch_size
        self.standardized_ingredients = None
        self.embeddings = None
        self.index = None

    def get_device(self):
        if torch.cuda.is_available():
            return 'cuda'
        else:
            return 'cpu'

    def load_standardized_ingredients(self):
        return pd.read_csv(STANDARDIZED_INGREDIENTS_FILE)

    def generate_embeddings(self):
        all_embeddings = []
        for chunk in tqdm(pd.read_csv(STANDARDIZED_INGREDIENTS_FILE, chunksize=self.batch_size), desc="Generating Embeddings"):
            chunk_embeddings = self.model.encode(
                chunk['ingredient_name'].tolist(),
                convert_to_tensor=True,
                show_progress_bar=False,
                device=self.device
            )
            all_embeddings.append(chunk_embeddings.cpu().numpy())
        return np.vstack(all_embeddings)

    def build_faiss_index(self):
        dimension = self.embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(self.embeddings)
        return index

    def save_embeddings(self, path=EMBEDDINGS_PATH):
        ingredient_data = self.load_standardized_ingredients()
        with open(path, 'wb') as f:
            pickle.dump({
                'ingredients': ingredient_data,
                'embeddings': self.embeddings,
                'index': self.index
            }, f)

    def load_embeddings(self, path=EMBEDDINGS_PATH):
        with open(path, 'rb') as f:
            data = pickle.load(f)
            self.standardized_ingredients = data['ingredients']
            self.embeddings = data['embeddings']
            self.index = data['index']

    def initialize_embeddings(self):
        self.embeddings = self.generate_embeddings()
        self.index = self.build_faiss_index()
        self.standardized_ingredients = self.load_standardized_ingredients()

# Define IngredientMatcher class
class IngredientMatcher:
    def __init__(self, threshold=THRESHOLD):
        self.threshold = threshold
        self.device = self.get_device()
        self.embeddings = IngredientEmbeddings()
        
        if os.path.exists(EMBEDDINGS_PATH):
            self.embeddings.load_embeddings(EMBEDDINGS_PATH)
        else:
            self.embeddings.initialize_embeddings()
            self.embeddings.save_embeddings(EMBEDDINGS_PATH)
    
    def get_device(self):
        if torch.cuda.is_available():
            return 'cuda'
        else:
            return 'cpu'

    def get_embedding(self, ingredient):
        model = self.embeddings.model.to(self.device)
        return model.encode(
            [ingredient], 
            convert_to_tensor=True, 
            device=self.device
        ).cpu().numpy()

    def match_ingredient(self, ingredient):
        query_embedding = self.get_embedding(ingredient)
        distances, indices = self.embeddings.index.search(query_embedding, 1)
        closest_distance = distances[0][0]
        closest_index = indices[0][0]
        similarity = 1 / (1 + closest_distance)
        if similarity >= self.threshold:
            matched_name = self.embeddings.standardized_ingredients.iloc[closest_index]['ingredient_name']
            return matched_name, similarity
        else:
            return None, similarity

    def match_ingredients_list(self, ingredients):
        matched = {}
        for ingredient in ingredients:
            match, score = self.match_ingredient(ingredient)
            matched[ingredient] = {
                'matched_name': match,
                'similarity': score
            }
        return matched 

# Main logic to standardize ingredients
def standardize_ingredients(user_ingredients, ai_ingredients):
    matcher = IngredientMatcher(threshold=THRESHOLD)
    
    print("Matching User Ingredients...")
    matched_user = matcher.match_ingredients_list(user_ingredients)
    
    print("Matching AI-Generated Ingredients...")
    matched_ai = matcher.match_ingredients_list(ai_ingredients)
    
    return matched_user, matched_ai

In [32]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
import torch_xla.core.xla_model as xm

class IngredientMatcher:
    def __init__(self, threshold=0.7):
        self.threshold = threshold
        self.device = self.get_device()
        print(self.device)
        self.embeddings = IngredientEmbeddings()
        try:
            print('embedding already present')
            self.embeddings.load_embeddings()
        except FileNotFoundError:
            self.embeddings.generate_embeddings()
            self.embeddings.save_embeddings()

    def get_device(self):
        if 'COLAB_TPU_ADDR' in os.environ:
            return xm.xla_device()
        elif torch.cuda.is_available():
            return 'cuda'
        else:
            return xm.xla_device()

    def get_embedding(self, ingredient):
        model = self.embeddings.model.to(self.device)
        return model.encode(
            [ingredient],
            convert_to_tensor=True,
            device=self.device
        ).cpu().numpy()

    def match_ingredient(self, ingredient):
        query_embedding = self.get_embedding(ingredient)
        distances, indices = self.embeddings.index.search(query_embedding, 1)
        closest_distance = distances[0][0]
        closest_index = indices[0][0]
        similarity = 1 / (1 + closest_distance)  # Convert L2 distance to similarity
        if similarity >= self.threshold:
            return self.embeddings.standardized_ingredients[closest_index], similarity
        else:
            return None, similarity

    def match_ingredients_list(self, ingredients):
        matched = {}
        for ingredient in ingredients:
            match, score = self.match_ingredient(ingredient)
            matched[ingredient] = {
                'matched_name': match,
                'similarity': score
            }
        return matched

In [33]:
# Execute the main logic
matched_user, matched_ai = standardize_ingredients(USER_INGREDIENTS, AI_INGREDIENTS)

print("\nMatched User Ingredients:")
for k, v in matched_user.items():
    print(f"{k} -> {v['matched_name']} (Similarity: {v['similarity']:.2f})")

print("\nMatched AI-Generated Ingredients:")
for k, v in matched_ai.items():
    print(f"{k} -> {v['matched_name']} (Similarity: {v['similarity']:.2f})")

xla:0
xla:0


AttributeError: 'NoneType' object has no attribute 'shape'