In [75]:
import io
import subprocess
import pickle

In [77]:
model_data = subprocess.check_output(["gsutil", "cat", "gs://bucket-for-testing-madrid/km_model_OpenAi.pkl"])

In [78]:
with io.BytesIO(model_data) as f:
    loaded_model = pickle.load(f)

In [79]:
loaded_model

In [80]:
# Download the file using gsutil with a local path
local_model_path = "local_model.pkl"
subprocess.run(["gsutil", "cp", "gs://bucket-for-testing-madrid/km_model_OpenAi.pkl", local_model_path])


Copying gs://bucket-for-testing-madrid/km_model_OpenAi.pkl...
/ [1 files][983.6 KiB/983.6 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.


Operation completed over 1 objects/983.6 KiB.                                    


CompletedProcess(args=['gsutil', 'cp', 'gs://bucket-for-testing-madrid/km_model_OpenAi.pkl', 'local_model.pkl'], returncode=0)

In [81]:
with open(local_model_path, "rb") as f:
    model = pickle.load(f)

In [82]:
model

In [83]:
import pickle
import os
from sklearn.cluster import KMeans
import openai
from sklearn.metrics.pairwise import cosine_similarity
from params import *
import numpy as np
from google.cloud import storage
import io
import subprocess

from st_files_connection import FilesConnection


In [84]:
conn = st.connection('gcs', type=FilesConnection)

In [85]:
def load_model():
    try:
        model_data = subprocess.check_output(["gsutil", "cat", "gs://bucket-for-testing-madrid/km_model_OpenAi.pkl"])
        with io.BytesIO(model_data) as f:
            loaded_model = pickle.load(f)
        return loaded_model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

In [96]:
load_model()

In [97]:
ingredients_text = ['tomato','pasta']
ingredients_text

['tomato', 'pasta']

In [102]:
def get_embedding(ingredients_text):
    '''Get embedding of the ingredients text'''
    openai_model = "text-embedding-ada-002"
    openai.api_key = OPENAI_KEY
    igre_embedding = openai.embeddings.create(input = ingredients_text, model = openai_model)
    print(len(np.array(igre_embedding.data[0].embedding)))

    return np.array(igre_embedding.data[0].embedding)

In [103]:
igre_embedding = get_embedding(ingredients_text)
igre_embedding

1536


array([ 0.01146588, -0.01494648, -0.01048716, ..., -0.00677971,
       -0.0253429 , -0.01432425])

In [109]:
def get_cosine(igre_embedding):
    ''' Get cosine matrix vs. trained embeddings'''

    if LOAD_MODEL == "gcp":
        # Specify your bucket name and file name
        # bucket_name = BUCKET_NAME
        # blob_name = 'ten_embeddings_temp_array_nom.pkl'

        # # Initialize the client
        # client = storage.Client()

        # # Get the bucket and blob
        # bucket = client.get_bucket(bucket_name)
        # blob = bucket.blob(blob_name)

        # # Download the blob to an in-memory file
        # in_memory_file = io.BytesIO()
        # blob.download_to_file(in_memory_file)
        # in_memory_file.seek(0)  # Important: move back to the start of the file before reading

        # Load the model directly from the in-memory file
        # dataset_embeddings_10 = pickle.load(in_memory_file)
        ten_embed = subprocess.check_output(["gsutil", "cat", "gs://bucket-for-testing-madrid/ten_embeddings_temp_array_nom.pkl"])
        with io.BytesIO(ten_embed) as f:
            dataset_embeddings_10 = pickle.load(f)
        print(len(dataset_embeddings_10))

    else:
        parent_dir = os.getcwd()
        filepath = os.path.join(parent_dir, "raw_data", "ten_embeddings_temp_array_nom.pkl")
        dataset_embeddings_10 = pickle.load(open(filepath,"rb"))

    ingre_embedding_reshapped = igre_embedding.reshape(1, 1536)
    print(len(ingre_embedding_reshapped))
    cos_sim_ingre_embed = cosine_similarity(ingre_embedding_reshapped, dataset_embeddings_10)
    print(len(cos_sim_ingre_embed))

    return cos_sim_ingre_embed

In [110]:
# def get_cosine(igre_embedding):
#     if LOAD_MODEL == "gcp":
#         bucket_name = BUCKET_NAME
#         blob_name = 'ten_embeddings_temp_array_nom.pkl'

#         try:
#             # Initialize the client
#             client = storage.Client()

#             # Get the bucket and blob
#             bucket = client.get_bucket(bucket_name)
#             print(bucket)
#             blob = bucket.blob(blob_name)
#             print(blob)

#             # Download the blob to an in-memory file (optional)
#             # in_memory_file = io.BytesIO()
#             # blob.download_to_file(in_memory_file)
#             # in_memory_file.seek(0)  # Important: move back to the start of the file

#             # Load the model directly from the downloaded blob (preferred)
#             dataset_embeddings_10 = pickle.loads(blob.download_as_string())
#             print(len(dataset_embeddings_10))
#             return dataset_embeddings_10
#         except Exception as e:
#             print(f"Error loading embeddings from GCS: {e}")
#             return None
#     else:
#         parent_dir = os.getcwd()
#         filepath = os.path.join(parent_dir, "raw_data", "ten_embeddings_temp_array_nom.pkl")
#         dataset_embeddings_10 = pickle.load(open(filepath,"rb"))
#         print(len(dataset_embeddings_10))
#     ingre_embedding_reshapped = igre_embedding.reshape(1, 1536)
#     cos_sim_ingre_embed = cosine_similarity(ingre_embedding_reshapped, dataset_embeddings_10)
#     print(ingre_embedding_reshapped)
#     print(len(ingre_embedding_reshapped))
#     print(cos_sim_ingre_embed)
#     print(len(cos_sim_ingre_embed))
#     return cos_sim_ingre_embed


In [111]:
cosine = get_cosine(igre_embedding)
cosine

10
1
1


array([[0.79321886, 0.79137911, 0.83141342, 0.80793341, 0.81661029,
        0.78182595, 0.8022221 , 0.77803703, 0.78211731, 0.77299643]])

In [112]:
def get_cluster(ingredients_text):
    '''Get Cluster based on ingredients'''

    #Load Model
    model = load_model()
    print(model)

    # Get embedding of the ingredients text
    ingre_embedding = get_embedding(ingredients_text)
    print(len(ingre_embedding))

    # Get cosine matrix vs. trained embeddings
    cos_sim_ingre_embed = get_cosine(ingre_embedding)
    print(len(cos_sim_ingre_embed))

    # Get clusters
    cluster_label = model.predict(cos_sim_ingre_embed)
    print(cluster_label)

    print("\n✅ get_cluster() done \n")
    print(f"Cluster label for ingredients '{ingredients_text}' is {cluster_label[0]}\n")

    return cluster_label[0]

In [113]:
get_cluster(ingredients_text)

KMeans(n_clusters=1000)
1536
1536
10
1
1
1
[247]

✅ get_cluster() done 

Cluster label for ingredients '['tomato', 'pasta']' is 247



247