# OpenAI embeddings for recommendations
## 0. Imports

In [16]:
import pandas as pd

import numpy as np 
from typing import Any
from enum import Enum
import json
import math
import pickle 

from main.openai.openai_utils.openai_utils import embedding_from_string

from main.data.session_dataset import *
from main.abstract_model import Model

from main.utils.top_k_computer import TopKComputer
from main.utils.split_dict import split_dict


In [17]:
WORKING_DIR = "../../../beauty"
EMBEDDINGS_NAME = "product_embeddings_openai"
WEIGHT_DISTRIBUTION = "LAST_ONLY"
SIMILARITY_MEASURE = "DOT"
TOP_K = 20

## 1. Load session dataset.

In [18]:
dataset: SessionDataset = SessionDataset.from_pickle(f"{WORKING_DIR}/dataset.pickle")

In [19]:
test = dataset.get_test_prompts()
list(test.items())[:10]

[(13,
  array([  698,  9159,  2331,  3611,  4673,  8027,  8386, 11818,  2222,
         11133,  8238,  8563, 11553,  4327,  8943,  2532,  2325,  8553,
          4357,  4011,  5162,  1928, 11205,    43,  4448,  9002,  3876,
          5101, 10491,  8672,  4324,  4299,  7120,  2250,  7784,  2158,
          7427])),
 (15, array([11846,  6425, 11111,  1817])),
 (18, array([1276, 3548, 4352, 4045, 7686, 2504])),
 (20, array([1360, 3061, 4039, 9559, 4992, 1352, 3736, 7471, 6476, 7019])),
 (23,
  array([ 6175, 10787,  2454,  9355,  2460, 10037,  6438,    97,  8798,
          8573, 11058,  5469, 10515,  5867,  4597,  4688,  1643,  6012,
          6020,  9745, 11764,  9240,  2899,  3731])),
 (31, array([ 9593, 11978, 11316,  5028,   869, 11310,  1386,  1246])),
 (36,
  array([ 5977,  9330,  3059,  2020,  3475, 10687, 12050, 10527,  6672,
          9359,  5210, 10821,  2943, 11052,  1582, 10114,  7901])),
 (38, array([ 1734,  3800,  6938, 11352,  3086,  9211])),
 (39, array([2702, 7566, 7636, 3102

## 2. Load auxiliary data, including embeddings

In [20]:
product_embeddings = pd.read_csv(f"{WORKING_DIR}/{EMBEDDINGS_NAME}.csv.gzip", compression="gzip")
product_embeddings.head(10)

Unnamed: 0.1,Unnamed: 0,global_product_id,name,ada_embedding
0,0,1504,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,"[-0.008468648418784142, 0.014345130883157253, ..."
1,1,564,Xtreme Brite Brightening Gel 1oz.,"[0.019681310281157494, 0.009377948939800262, -..."
2,2,9963,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,"[-0.00300808809697628, -0.007103437092155218, ..."
3,3,9839,Versace Bright Crystal Eau de Toilette Spray f...,"[0.0053097945638000965, 0.0017624408937990665,..."
4,4,4132,Stella McCartney Stella,"[-0.006986561696976423, -0.0015255995094776154..."
5,5,438,Avalon Biotin B-Complex Thickening Conditioner...,"[-0.011060410179197788, -0.017783403396606445,..."
6,6,9381,"Better Living Classic Two Chamber Dispenser, W...","[-0.004991547204554081, 0.019236043095588684, ..."
7,7,3618,Better Living The Ulti-Mate Dispenser,"[-0.007757279556244612, 0.014554604887962341, ..."
8,8,11467,Crabtree and Evelyn - Gardener's Ultra-Moist...,"[-0.010174826718866825, -0.0013344729086384177..."
9,9,6073,Crabtree and Evelyn 2792 Gardeners Hand Ther...,"[0.0050934250466525555, -0.00951578002423048, ..."


In [21]:
product_id_to_name = product_embeddings[["global_product_id", "name"]].set_index("global_product_id").to_dict()["name"]
list(product_id_to_name.items())[:10]

[(1504,
  'WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette'),
 (564, 'Xtreme Brite Brightening Gel 1oz.'),
 (9963, 'Prada Candy By Prada Eau De Parfum Spray 1.7 Oz For Women'),
 (9839, 'Versace Bright Crystal Eau de Toilette Spray for Women, 3 Ounce'),
 (4132, 'Stella McCartney Stella'),
 (438, 'Avalon Biotin B-Complex Thickening Conditioner, 14 Ounce'),
 (9381, 'Better Living Classic Two Chamber Dispenser, White'),
 (3618, 'Better Living The Ulti-Mate Dispenser'),
 (11467,
  "Crabtree  and  Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ"),
 (6073, 'Crabtree  and  Evelyn 2792 Gardeners Hand Therapy (100ml, 3.4 oz)')]

In [22]:
product_index_to_embedding = product_embeddings[["global_product_id", "ada_embedding"]].set_index("global_product_id").to_dict()["ada_embedding"]
product_index_to_embedding = {k: np.array(json.loads(v)) for k, v in product_index_to_embedding.items()}
product_index_to_embedding = np.array(list(product_index_to_embedding.values()))

In [23]:
product_index_to_embedding.shape

(12101, 1536)

In [24]:
product_index_to_id = list(product_id_to_name.keys())
product_id_to_index = {id: index for index, id in enumerate(product_index_to_id)}

## 3. Define a session-embedding based recommender

In [25]:
class OpenAIEmbedder(Model):
    def __init__(self, pred_seen : bool = False, is_verbose: bool = False, cores: int = 1) -> None:

        self.pred_seen = pred_seen
        super().__init__(is_verbose, cores)
    
    def train(self, train_data: dict[int, np.ndarray]) -> None:
        # We do not have to train.
        self.is_trained = True 
    
    def predict(self, predict_data: dict[int, np.ndarray], top_k: int = 10) -> dict[int, np.ndarray]:
        global product_index_to_embedding
        global product_index_to_id
        global product_id_to_index

        self.num_items = product_index_to_embedding.shape[0]

        recommendations = {}

        # Split sessions into batches.
        num_batches = math.ceil(len(predict_data) / 500)
        session_batches = split_dict(predict_data, num_batches)

        # Calculate recommendations for batches
        for i, session_batch in enumerate(session_batches):
            print(f"Batch {i} of {len(session_batches)}", end="\r")

            session_embeddings = []
            dense_interacted_items_batch = []

            # Loop through the session interactions..
            for interactions in session_batch.values():
                # Get embeddings of all the interacted items in the session.
                interactions_indices = [product_id_to_index[item] for item in interactions]
                cur_item_embeddings = product_index_to_embedding[interactions_indices]

                # Sum embeddings to get a session embedding.
                weights = None 
                if WEIGHT_DISTRIBUTION == "CONSTANT": 
                    weights = np.ones(len(interactions)) 
                elif WEIGHT_DISTRIBUTION == "LINEAR": 
                    weights = np.arange(1, len(interactions) + 1)
                elif WEIGHT_DISTRIBUTION == "QUADRATIC": 
                    linear_weights = np.arange(1, len(interactions) + 1)
                    weights = np.multiply(linear_weights, linear_weights)
                elif WEIGHT_DISTRIBUTION == "CUBIC": 
                    linear_weights = np.arange(1, len(interactions) + 1)
                    quadratic_weights = np.multiply(linear_weights, linear_weights)
                    weights = np.multiply(quadratic_weights, linear_weights)
                elif WEIGHT_DISTRIBUTION == "LAST_ONLY": 
                    weights = np.array([0 if i < len(interactions) else 1 for i in np.arange(1, len(interactions) + 1)])
                else: 
                    raise ValueError("Unknown weight distribution")
                
                session_embedding = np.average(cur_item_embeddings, axis=0, weights=weights)

                # Get a dense multi-hot of the interacted items.
                dense_interacted_items = np.zeros(self.num_items)
                dense_interacted_items[interactions_indices] = 1

                # session_product_names = [product_id_to_name[item] for item in interactions]
                # session_string = ", ".join(session_product_names)
                # print(session_string)
                # session_embedding = embedding_from_string(session_string)

                # Append results.
                session_embeddings.append(session_embedding)
                dense_interacted_items_batch.append(dense_interacted_items)
            
            # Convert batch results to numpy arrays.
            session_embeddings = np.array(session_embeddings)
            dense_interacted_items_batch = np.array(dense_interacted_items_batch)

            # Get predictions for this batch.
            if SIMILARITY_MEASURE == "DOT":
                predictions_batch = session_embeddings @ product_index_to_embedding.T
            elif SIMILARITY_MEASURE == "COS": 
                dot = session_embeddings @ product_index_to_embedding.T
                predictions_batch = dot / (np.linalg.norm(session_embeddings) * np.linalg.norm(product_index_to_embedding))
            elif SIMILARITY_MEASURE == "EUCL": 
                session_embedding_rep = np.repeat(session_embeddings[:, :, np.newaxis], len(product_index_to_embedding), axis=2)
                session_embedding_rep = np.transpose(session_embedding_rep, axes=(0, 2, 1))
                predictions_batch = -1 * np.linalg.norm(session_embedding_rep - product_index_to_embedding, axis=2)
            else: 
                raise ValueError("Unknown similarity measure")
            
            if not(self.pred_seen):
                # Remove items already in batch from predictions.
                allowed_items = 0 - dense_interacted_items_batch * 1_000_000_000
                predictions_batch = np.add(predictions_batch, allowed_items)

            # Get top-k items.
            top_k_batch = TopKComputer.compute_top_k(predictions_batch, top_k)
            top_k_batch_shape = top_k_batch.shape
            top_k_batch = np.ndarray.flatten(top_k_batch)
            top_k_batch = [product_index_to_id[item] for item in top_k_batch]
            top_k_batch = np.reshape(top_k_batch, top_k_batch_shape)
            
            # Update recommendations.
            recommendations_batch = dict(zip(session_batch.keys(), top_k_batch))
            recommendations.update(recommendations_batch)

        return recommendations 
    
    def name(self) -> str:
        return f"Embeddings"

## 4. Show some example recommendations

In [26]:
VERBOSE = True

num_sessions_to_show = 5

# Create model.
open_ai_add = OpenAIEmbedder(is_verbose=VERBOSE)
open_ai_add.train(None)

# Create test sessions.
test_sessions = {k: test[k] for k in list(test.keys())[:num_sessions_to_show]}
recommendations = open_ai_add.predict(test_sessions, top_k=TOP_K)

# Print input-recommendation pairs per session.
for session_id in test_sessions.keys():
    session_interacted_items = test_sessions[session_id]
    session_recommendations = recommendations[session_id]

    user_interactions_names = [product_id_to_name[item] for item in session_interacted_items]
    user_recommendation_names = [product_id_to_name[item] for item in session_recommendations]
    print("Interactions: ")
    print(user_interactions_names)
    print("Recommendations:")
    print(user_recommendation_names)
    print("\n")


Interactions: 
['Biolage by Matrix Conditioning Balm 16.9 Ounces', 'Q-tips Cotton Swabs, 500 Count', "Murphy's Oil Soap, 32-Ounce", 'Suave Professionals Humectant Moisture Shampoo , 12.6 fl Ounce (373 ml)', 'Head  and  Shoulders Classic Clean 2 in 1 Dandruff Shampoo  and  Conditioner 23.7 Fluid ounce (Pack of 2) (packaging may vary)', 'Neutrogena Makeup Remover Cleansing Towelettes, Refill Pack, 25 Count (Pack of 3)', 'Olay Daily Facials Express Wet Cleansing Cloths, All Skin Types, 30 Count  (Pack of 4)', 'John Frieda Brilliant Brunette Shine Shock Glosser - 2.4 oz', 'Tresemme Smooth  and  Silky Shampoo with Moroccan Argan Oil, 32 Ounce', 'Tresemme Smooth and Silky Conditioner, 32 Ounce', 'Olay Quench Ultra Moisture Lotion with Shea Butter 20.2 Fl Oz (Pack of 2)', 'Ivory Lavender Body Wash 24 Fl Oz (Pack of 6)', 'Fekkai Glossing Hair Products Starter 1 Kit', 'Olay Regenerist Micro-Sculpting Serum 1.7 Fl Oz', 'Selsun Blue Dandruff Shampoo, Moisturizing with Aloe for Dry Scalp and Hair,

## 5. Produce recommendations for all test sessions

In [27]:
model = OpenAIEmbedder(is_verbose=VERBOSE)
model.train(None)

In [28]:
recommendations: dict[int, np.ndarray] = model.predict(test, top_k=TOP_K)

Batch 8 of 9

## 6. Store results

In [29]:
predictions_pickle: bytes = pickle.dumps(recommendations)

In [30]:
rec_name = f"../../../results/recs_{EMBEDDINGS_NAME}_{WEIGHT_DISTRIBUTION}_{SIMILARITY_MEASURE}.pickle"
with open(rec_name, "wb") as file: 
    file.write(predictions_pickle)