In [6]:
import time

# Wall-clock timestamp taken before any other cell runs; the final cell
# subtracts it from the end time to report the notebook's total execution time.
notebook_start_time = time.time()

In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active IPython shell object.

    Returns:
        True when the substring is present, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the course repository and make its root the working directory.

    Both steps use IPython magics, so this function only works when called
    from an IPython/Jupyter session.
    """
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    # `%cd` changes the kernel's working directory for all subsequent cells.
    %cd hands-on-recommender-system/

def install_dependencies() -> None:
    """Install `uv`, then use it to install the project's dependencies.

    The requirements are read from `pyproject.toml` relative to the current
    working directory, so this must run from the repository root.
    """
    # NOTE(review): `!pip` runs whichever `pip` is first on PATH — presumably
    # the kernel's interpreter on Colab; confirm, or consider `%pip`.
    !pip install --upgrade uv
    # NOTE(review): per the uv CLI docs, `--system` targets the system Python
    # instead of a virtual environment and `--all-extras` pulls in every
    # optional dependency group — confirm against the installed uv version.
    !uv pip install --all-extras --system --requirement pyproject.toml

if not is_google_colab():
    # Local run: the project root is taken to be the parent of the notebook's
    # working directory.
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")
else:
    # Colab run: fetch the code and install its dependencies first.
    clone_repository()
    install_dependencies()

    # `clone_repository` leaves the working directory inside the clone, so
    # the current directory is the project root.
    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")

# Put the project root on the module search path (once) so that the `recsys`
# package can be imported from this notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)

## <span style="color:#ff5f27">👨🏻‍🏫 Item candidates embeddings creation </span>

In this notebook you will create a feature group for your candidate embeddings.

In [7]:
import sys
from pathlib import Path

# Resolve the project root only when the environment-setup cell has not already
# done so. Recomputing it unconditionally would, on Google Colab (where the
# working directory is the cloned repository itself), replace the correct root
# with its parent directory and add that unrelated directory to `sys.path`.
if "root_dir" not in globals():
    root_dir = str(Path().absolute().parent)

# Put the project root on the module search path (once) so that the `recsys`
# package can be imported from this notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)

## <span style="color:#ff5f27">📝 Imports </span>

In [9]:
import tensorflow as tf

import warnings
# Ignore every Python warning raised from this point on, presumably to keep
# cell output readable. The filter is installed after `tensorflow` is imported,
# so warnings emitted during that import are not affected.
warnings.filterwarnings('ignore')

from recsys import utils
from recsys import features
from recsys.models.two_tower_serving import CandidateModelModule

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [10]:
# Log in to Hopsworks and get handles to the project and its feature store.
# NOTE(review): the logged output indicates the helper authenticates with the
# HOPSWORKS_API_KEY environment variable — confirm in `recsys.utils`.
project, fs = utils.get_hopsworks_feature_store()

# Model registry handle; used further down to load the candidate model.
mr = project.get_model_registry()

[32m2024-11-09 18:14:17.288[0m | [1mINFO    [0m | [36mrecsys.utils[0m:[36mget_hopsworks_feature_store[0m:[36m10[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/15551
Connected. Call `.close()` to terminate connection gracefully.
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27">🎯 Compute Candidate Embeddings </span>

You start by computing candidate embeddings for all items in the training data.

First, you load your candidate model. Recall that you uploaded it to the Hopsworks Model Registry in the previous notebook. If you don't have the model locally you can download it from the Model Registry using the following code:

In [11]:
# Load the candidate model through the model registry handle. The second return
# value, `model_schema`, is passed to the preprocessing step further down.
candidate_model, model_schema = CandidateModelModule.load_from_hopsworks(mr=mr)

Downloading model artifact (2 dirs, 6 files)... DONE

Next you compute the embeddings of all candidate items that were used to train the retrieval model.

In [12]:
# Look up version 1 of the "retrieval" feature view in the feature store.
feature_view = fs.get_feature_view(
    name="retrieval", 
    version=1,
)

In [13]:
# Read the data behind the feature view and split it, holding out 10% for
# validation and 10% for testing. Of the six returned values only `train_df`
# is used by the later cells shown in this notebook.
# NOTE(review): the three discarded values are presumably the label frames of
# each split — confirm against the Hopsworks feature view API.
train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(
    validation_size=0.1, 
    test_size=0.1,
    description='Retrieval dataset splits',
)
train_df.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (119.67s) 


Unnamed: 0,customer_id,article_id,t_dat,price,month_sin,month_cos,age,club_member_status,age_group,garment_group_name,index_group_name
0,699d60943d900160e5b6a266597fec99b41aee568713ea...,697201002,1559347200000,0.030492,1.224647e-16,-1.0,62.0,ACTIVE,56-65,Trousers,Ladieswear
1,563537e896b37b7e90d02468d2c1531ef8fd5ffe083e78...,727039001,1564704000000,0.015237,-0.8660254,-0.5,32.0,ACTIVE,26-35,Jersey Fancy,Ladieswear
2,699d60943d900160e5b6a266597fec99b41aee568713ea...,744712003,1559347200000,0.030492,1.224647e-16,-1.0,62.0,ACTIVE,56-65,Jersey Fancy,Ladieswear


In [14]:
# Derive the item-level frame fed to the candidate model from the training
# rows, using the model schema. The preview shows it carries `article_id`,
# `garment_group_name` and `index_group_name`.
# NOTE(review): presumably this also de-duplicates articles — confirm in
# `features.embeddings.preprocess`.
item_df = features.embeddings.preprocess(train_df, model_schema)
item_df.head(3)

Unnamed: 0,article_id,garment_group_name,index_group_name
0,697201002,Trousers,Ladieswear
1,727039001,Jersey Fancy,Ladieswear
2,744712003,Jersey Fancy,Ladieswear


In [15]:
# Wrap the item DataFrame in a tf.data pipeline: one tensor slice per row,
# keyed by column name.
item_ds = tf.data.Dataset.from_tensor_slices(dict(item_df))

# Batch the items and pair each batch's article ids with the candidate model's
# output for that batch.
candidate_embeddings = (
    item_ds
    .batch(2048)
    .map(lambda batch: (batch["article_id"], candidate_model(batch)))
)

> Strictly speaking, you haven't actually computed the candidate embeddings yet, as the dataset functions are lazily evaluated.

## <span style="color:#ff5f27">⚙️ Data Preparation </span>


In [16]:
# Turn the (lazily defined) embeddings dataset into a DataFrame; the preview
# below shows the resulting `article_id` and `embeddings` columns.
# NOTE(review): this is presumably where the model is actually run over every
# batch — confirm in `features.embeddings.postprocess`.
data_emb = features.embeddings.postprocess(candidate_embeddings)

data_emb.head()

Unnamed: 0,article_id,embeddings
0,697201002,"[1.3150668144226074, -0.4790515899658203, 0.30..."
1,727039001,"[0.6000490784645081, -0.19955429434776306, -0...."
2,744712003,"[-0.12406323850154877, 0.1571202129125595, 0.5..."
3,625545003,"[0.32176533341407776, 0.11810187250375748, 0.1..."
4,636207006,"[-0.8955077528953552, 0.27362650632858276, 0.1..."


## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

Now you are ready to create a feature group for your candidate embeddings.

To begin with, you need to create your Embedding Index where you will specify the name of the embeddings feature and the embeddings length.
Then you attach this index to the feature group (FG).

In [17]:
from hsfs import embedding

# Create the Embedding Index
emb = embedding.EmbeddingIndex()

# Register the vector feature on the index. The length is read from the first
# row of `data_emb`, so this assumes the frame is non-empty and that every
# vector has the same length.
emb.add_embedding(
    "embeddings",                           # Embeddings feature name
    len(data_emb["embeddings"].iloc[0]),    # Embeddings length
)

In [18]:
# Get or create the 'candidate_embeddings_fg' feature group, keyed by
# `article_id`, with the embedding index attached and online storage enabled.
candidate_embeddings_fg = fs.get_or_create_feature_group(
    name="candidate_embeddings_fg",
    embedding_index=emb,                    # Specify the Embedding Index
    primary_key=['article_id'],
    version=1,
    description='Embeddings for each article',
    online_enabled=True,
)

# Upload the embeddings DataFrame. As the output below shows, this also
# launches an offline materialization job on Hopsworks.
candidate_embeddings_fg.insert(data_emb)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15551/fs/15471/fg/1343841


Uploading Dataframe: 0.00% |          | Rows 0/11895 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: candidate_embeddings_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15551/jobs/named/candidate_embeddings_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x355be9310>, None)

## <span style="color:#ff5f27">🪄 Feature View Creation </span>


In [19]:
# Get or create the 'candidate_embeddings' feature view.
# NOTE: this rebinds `feature_view`, which until now referred to the
# "retrieval" feature view loaded earlier in the notebook.
feature_view = fs.get_or_create_feature_view(
    name="candidate_embeddings",
    version=1,
    description='Embeddings of each article',
    # The query selects only `article_id` from the feature group.
    query=candidate_embeddings_fg.select(["article_id"]),
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15551/fs/15471/fv/candidate_embeddings/version/1


---

In [20]:
# End the timer
notebook_end_time = time.time()

# Elapsed wall-clock seconds since `notebook_start_time`, which is set in the
# notebook's first cell (that cell must have been run in this kernel session).
notebook_execution_time = notebook_end_time - notebook_start_time
print(f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds")

⌛️ Notebook Execution time: 152.71 seconds


---
## <span style="color:#ff5f27">⏩️ Next Steps </span>

At this point you have a recommender system that is able to generate a set of candidate items for a customer. However, many of these could be poor, as the candidate model was trained with only a small subset of the features. In the next notebook, you'll create a ranking dataset to train a *ranking model* to make more fine-grained predictions.