In [None]:
import time

# Wall-clock start timestamp. The last code cell subtracts this value from its
# own time.time() reading to report the notebook's total execution time.
notebook_start_time = time.time()

# Set up environment

In [None]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``"google.colab"`` in the string form of
    the active IPython shell object returned by ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description


def clone_repository() -> None:
    """Clone the hands-on-recommender-system repo and switch into its directory.

    Implemented with IPython ``!`` / ``%`` magics, so it only works inside an
    IPython/Jupyter kernel. The ``%cd`` changes the kernel's working directory,
    which is why the caller treats the current directory as the repository
    root afterwards.

    NOTE(review): calling this again in the same session will find the target
    directory already present -- confirm that re-runs on Colab are acceptable.
    """
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/


def install_dependencies() -> None:
    """Install the project's dependencies using ``uv``.

    First installs/upgrades ``uv`` through pip, then asks ``uv`` to install the
    requirements declared in ``pyproject.toml`` with all extras. Relies on
    IPython ``!`` shell escapes and on ``pyproject.toml`` being resolvable from
    the current working directory.
    """
    !pip install --upgrade uv
    # `--system` targets the system Python environment rather than a virtualenv.
    !uv pip install --all-extras --system --requirement pyproject.toml


# Work out the repository root, which depends on where the notebook is running.
if not is_google_colab():
    # Local run: the root is taken to be the parent of the current working
    # directory.
    root_dir = str(Path.cwd().parent)
    print("⛳️ Local environment")
else:
    # Colab run: fetch the code and install its dependencies first. Cloning
    # also moves the working directory into the repository, so the current
    # directory is the root.
    clone_repository()
    install_dependencies()

    root_dir = str(Path.cwd())
    print("⛳️ Google Colab environment")

# Put the root directory on the module search path (once) so that the `recsys`
# Python module can be imported from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

# 👩🏻‍🔬 Feature pipeline: Computing item embeddings

In this notebook you will compute the candidate embeddings and populate a Hopsworks feature group with a vector index.

## 📝 Imports

In [None]:
import warnings

# Suppress every Python warning from this point on.
# NOTE(review): this runs before the third-party/project imports below,
# presumably so import-time warnings are hidden as well -- it will also hide
# deprecation warnings that may be worth seeing.
warnings.filterwarnings("ignore")

from loguru import logger

from recsys import features, hopsworks_integration
from recsys.config import settings

## Constants

In [None]:
from pprint import pprint

# Dump the active configuration as a plain dict for a quick sanity check.
# NOTE(review): this prints every field of `settings`; confirm that any
# credentials it holds are masked before committing or sharing the cell output.
pprint(dict(settings))

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [None]:
# Obtain handles to the Hopsworks project and its feature store.
project, fs = hopsworks_integration.get_feature_store()

# Model registry handle; used below to download the trained candidate model.
mr = project.get_model_registry()

# Computing candidate embeddings

You start by computing candidate embeddings for all items in the training data.

First, you load your candidate model. Recall that you uploaded it to the Hopsworks Model Registry in previous steps:

In [None]:
# Download the candidate model through the model registry handle. The call
# returns a pair: the model and `candidate_features`, which is later handed to
# `features.embeddings.preprocess` together with the training data.
candidate_model, candidate_features = (
    hopsworks_integration.two_tower_serving.HopsworksCandidateModel.download(mr=mr)
)

### Get candidates data

Now, you load the retrieval training data, which contains all the features required by the candidate embedding model.

In [None]:
# Fetch version 1 of the "retrieval" feature view from the feature store.
feature_view = fs.get_feature_view(
    name="retrieval",
    version=1,
)

In [None]:
# Split the feature view's data into train/validation/test sets using the
# split sizes from `settings`. The call returns six values; the last three are
# discarded here (presumably the label frames -- TODO confirm against the
# Hopsworks API).
# NOTE(review): `TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE` looks misspelled
# ("VALIDATON"); it must match the attribute name defined in `recsys.config`.
train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(
    validation_size=settings.TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE,
    test_size=settings.TWO_TOWER_DATASET_TEST_SPLIT_SIZE,
    description="Retrieval dataset splits",
)
# Preview the first rows of the training split (the only split used below).
train_df.head(3)

### Compute embeddings

Next, you compute the embeddings of all candidate items that were used to train the retrieval model.

In [None]:
# Build the item-level input for the candidate model from the training split,
# driven by the `candidate_features` returned alongside the model.
# (Presumably this keeps only the candidate feature columns and de-duplicates
# items -- TODO confirm in `features.embeddings.preprocess`.)
item_df = features.embeddings.preprocess(train_df, candidate_features)
item_df.head(3)

In [None]:
# Run the candidate model over the preprocessed items to obtain their
# embeddings, then preview the first rows of the result.
embeddings_df = features.embeddings.embed(df=item_df, candidate_model=candidate_model)
embeddings_df.head()

# <span style="color:#ff5f27">Create Hopsworks Embedding Index </span>

Now you are ready to create a feature group for your candidate embeddings.

To begin with, you need to create your Embedding Index, where you specify the name of the embeddings feature and the embedding length (its number of dimensions).
Then you attach this index to the feature group (FG).

In [None]:
# Create the candidate-embeddings feature group from `embeddings_df`, with
# online serving enabled (`online_enabled=True`). The returned handle is used
# by the next cell to build a feature view on top of it.
candidate_embeddings_fg = (
    hopsworks_integration.feature_store.create_candidate_embeddings_feature_group(
        fs=fs, df=embeddings_df, online_enabled=True
    )
)
logger.info("✅ Uploaded 'candidate_embeddings' Feature Group to Hopsworks!!")

## Expose it to the feature pipeline as a Feature View


In [None]:
# Build a feature view on top of the candidate-embeddings feature group.
# It is bound to its own name instead of reusing `feature_view`: that name
# already holds the "retrieval" feature view consumed by the dataset-split
# cell, and rebinding it here would make that earlier cell silently read from
# the wrong feature view if it were re-run after this one.
candidate_embeddings_feature_view = (
    hopsworks_integration.feature_store.create_candidate_embeddings_feature_view(
        fs=fs, fg=candidate_embeddings_fg
    )
)

## <span style="color:#ff5f27"> Inspecting the embeddings in Hopsworks UI </span>

View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups**

---

In [None]:
# Elapsed wall-clock time since `notebook_start_time` was recorded in the
# notebook's first code cell.
notebook_end_time = time.time()
notebook_execution_time = notebook_end_time - notebook_start_time

# Report the duration in both seconds and minutes.
logger.info(
    f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes"
)

# <span style="color:#ff5f27">→ Next Steps </span>

Now that we have our vector index populated with item embeddings, everything is ready for production. In the next notebook, we will zoom in on the inference pipeline and see how to deploy it to Hopsworks as a real-time deployment.