In [1]:
import time

# Start the timer: record the wall-clock start so the final cell can report
# the total notebook execution time.
notebook_start_time = time.time()

## <span style="color:#ff5f27">👩🏻‍🔬 Feature Engineering </span>

Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.

In [2]:
import sys
from pathlib import Path

# Resolve the parent of the current working directory and make it importable,
# so the `recsys` package can be imported from this notebook.
root_dir = str(Path.cwd().parent)
if sys.path.count(root_dir) == 0:
    sys.path.append(root_dir)

In [3]:
import os


def need_download_modules() -> bool:
    """Tell whether the feature modules must be downloaded (i.e. running on Colab).

    Returns:
        True when the active IPython shell's string representation mentions
        ``google.colab``; False otherwise, including when the code is not
        running under IPython at all.
    """
    try:
        shell = get_ipython()
    except NameError:
        # `get_ipython` is injected by IPython only; plain Python has no such name.
        return False
    return "google.colab" in str(shell)


# TODO: Fix Colab support
# NOTE(review): the files are downloaded into ./features, while the notebook
# imports from `recsys.features.*` — presumably why Colab support is still a TODO.
if not need_download_modules():
    print("⛳️ Local environment")
else:
    print("📥 Downloading modules")
    os.system("mkdir -p features")
    # One wget per feature module, issued in the same order as before.
    for module_name in (
        "articles",
        "customers",
        "interaction",
        "ranking",
        "transactions",
    ):
        os.system(
            "cd features && wget https://raw.githubusercontent.com/Maxxx-zh/hopsworks-tutorials/refs/heads/FSTORE-1565/advanced_tutorials/recommender-system/features/"
            + module_name
            + ".py"
        )

⛳️ Local environment


## <span style="color:#ff5f27">📝 Imports </span>

In [4]:
# TODO: How to adapt this with UV?
# !pip install -r requirements.txt

In [5]:
%load_ext autoreload
%autoreload 2

import random
import warnings

import polars as pl
import torch
from sentence_transformers import SentenceTransformer
from loguru import logger

warnings.filterwarnings("ignore")

from recsys import utils
from recsys.data import raw_data
from recsys.features.articles import (
    get_image_url,
    prepare_articles,
    generate_embeddings_for_dataframe
)
from recsys.features.customers import DatasetSize, prepare_customers
from recsys.features.interaction import generate_interaction_data
from recsys.features.ranking import compute_ranking_dataset
from recsys.features.transactions import prepare_transactions

## <span style="color:#ff5f27">📝 Constants </span>

In [6]:
# Data size configuration: `DATA_SIZE.get_size()` is used further down as the
# number of customers to sample.
DATA_SIZE = DatasetSize.SMALL

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [7]:
# Connect to Hopsworks; `fs` is the feature store handle used below to create
# and populate every feature group.
project, fs = utils.get_hopsworks_feature_store()

[32m2024-11-08 18:16:38.897[0m | [1mINFO    [0m | [36mrecsys.utils[0m:[36mget_hopsworks_feature_store[0m:[36m10[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/15551
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27">🗄️ Read Articles Data</span>

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This is also a unique identifier, but it is associated with a specific product or style rather than individual articles. It represents a broader category or type of product within H&M's inventory. Multiple articles may share the same product code if they belong to the same product line or style.

While both are unique identifiers, the article_id is specific to individual items, whereas the product_code represents a broader category or style of product.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [8]:
# Load the raw articles table through the project's `raw_data` helper.
articles_df = raw_data.extract_articles_df()

# Log the (rows, columns) shape and preview the first rows.
logger.info(articles_df.shape)
articles_df.head(3)

[32m2024-11-08 18:16:42.172[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m(105542, 25)[0m


CPU times: user 165 ms, sys: 51.4 ms, total: 216 ms
Wall time: 1.14 s


article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


In [9]:
# Count missing (null) values per column.
articles_df.null_count()

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,416


## <span style="color:#ff5f27">👨🏻‍🏭 Articles Feature Engineering</span>


In [10]:
# Run the project's article preparation step. The result replaces `articles_df`,
# so re-running this cell feeds already-prepared data back into the function.
# NOTE(review): presumably not idempotent — re-run from the load cell if in doubt.
articles_df = prepare_articles(articles_df)
articles_df.head(3)

CPU times: user 233 ms, sys: 5.56 ms, total: 238 ms
Wall time: 242 ms


article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…"


In [11]:
# Log the first article's full description (the table preview truncates it).
logger.info(articles_df["article_description"][0])

[32m2024-11-08 18:16:42.578[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStrap top - Vest top in Garment Upper body
Appearance: Solid
Color: Dark Black (Black)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.[0m


### <span style="color:#ff5f27">🧬 Create embeddings </span>

In [12]:
# Choose the torch backend: CUDA when available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
logger.info(f"Loading embedding model to {device=}")

# Instantiate the sentence-embedding model on the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

[32m2024-11-08 18:16:42.675[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading embedding model to device='mps'[0m


2024-11-08 18:16:42,675 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [13]:
# Generate embeddings from the `article_description` column with the model
# loaded above; the returned frame replaces `articles_df`.
# Reduce batch size if getting OOM errors.
articles_df = generate_embeddings_for_dataframe(articles_df, 'article_description', model, device, batch_size=32)

Generating embeddings:   0%|          | 0/105542 [00:00<?, ?it/s]

CPU times: user 2min 28s, sys: 21.4 s, total: 2min 50s
Wall time: 2min 50s


In [14]:
# Preview each description next to its embedding vector.
articles_df[["article_description", "embeddings"]].head(3)

article_description,embeddings
str,list[f64]
"""Strap top - Vest top in Garmen…","[-0.026782, 0.082344, … 0.022782]"
"""Strap top - Vest top in Garmen…","[-0.010396, 0.089874, … 0.022564]"
"""Strap top (1) - Vest top in Ga…","[-0.032753, 0.091124, … 0.022804]"


### <span style="color:#ff5f27">🔗 Image Links</span>

In [15]:
# Derive one image URL per article from its `article_id`.
# `return_dtype` is given explicitly so Polars does not have to infer the
# output type from the first value returned by `get_image_url`.
articles_df = articles_df.with_columns(
    image_url=pl.col("article_id").map_elements(
        get_image_url,
        return_dtype=pl.Utf8,
    )
)
# Show the first generated URL as a sanity check.
articles_df["image_url"][0]

'https://repo.hops.works/dev/jdowling/h-and-m/images/010/0108775015.jpg'

---
## <span style="color:#ff5f27">🗄️ Read Customers Data</span>

In [16]:
# Load the raw customers table through the project's `raw_data` helper,
# then log its (rows, columns) shape and preview the first rows.
customers_df = raw_data.extract_customers_df()
logger.info(customers_df.shape)
customers_df.head(3)

[32m2024-11-08 18:20:27.396[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m(1371980, 7)[0m


CPU times: user 935 ms, sys: 508 ms, total: 1.44 s
Wall time: 53.3 s


customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [17]:
# Count missing (null) values per column.
customers_df.null_count()

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
u32,u32,u32,u32,u32,u32,u32
0,895050,907576,6062,16009,15861,0


## <span style="color:#ff5f27">👨🏻‍🏭 Customers Feature Engineering</span>


In [18]:
# Run the project's customer preparation step; the result replaces `customers_df`.
# NOTE(review): presumably not idempotent — re-run from the load cell if in doubt.
customers_df = prepare_customers(customers_df)
customers_df.head(3)

CPU times: user 53.4 ms, sys: 44.9 ms, total: 98.3 ms
Wall time: 26.4 ms


customer_id,club_member_status,age,postal_code,age_group
str,str,f64,str,str
"""00000dbacae5abe5e23885899a1fa4…","""ACTIVE""",49.0,"""52043ee2162cf5aa7ee79974281641…","""46-55"""
"""0000423b00ade91418cceaf3b26c6a…","""ACTIVE""",25.0,"""2973abc54daa8a5f8ccfe9362140c6…","""19-25"""
"""000058a12d5b43e67d225668fa1f8d…","""ACTIVE""",24.0,"""64f17e6a330a85798e4998f62d0930…","""19-25"""


In [19]:
# Consider only customers with age defined: rows whose `age` is null are dropped.
customers_df = customers_df.drop_nulls(subset=["age"])

In [20]:
# Set a seed for reproducibility
SEED = 27
random.seed(SEED)

# Get the number of users based on the flag
N_USERS = DATA_SIZE.get_size()
logger.info(f"Keeping {N_USERS} customers.")

# Sample N_USERS from the DataFrame.
# The seed is passed to `sample` explicitly so the draw is reproducible on its
# own and does not depend on how Polars derives a default seed.
customer_subset_df = customers_df.sample(n=N_USERS, seed=SEED)

[32m2024-11-08 18:20:27.662[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mKeeping 1000 customers.[0m


---
## <span style="color:#ff5f27">🗄️ Read Transactions Data</span>

In [21]:
# Load the raw transactions table through the project's `raw_data` helper,
# then log its (rows, columns) shape and preview the first rows.
trans_df = raw_data.extract_transactions_df()
logger.info(trans_df.shape)
trans_df.head(3)

## <span style="color:#ff5f27">👨🏻‍🏭 Transactions Feature Engineering</span>

The time of year a purchase was made should be a strong predictor, as seasonality plays a major role in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month onto the unit circle using sine and cosine.

In [22]:
# Run the project's transaction preparation step; the result replaces `trans_df`.
# NOTE(review): presumably not idempotent — re-run from the load cell if in doubt.
trans_df = prepare_transactions(trans_df)
trans_df.head(3)

CPU times: user 3.36 s, sys: 339 ms, total: 3.7 s
Wall time: 2.83 s


t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week,month_sin,month_cos
i64,str,str,f64,i64,i32,i8,i8,i8,f64,f64
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""663713001""",0.050831,2,2018,9,20,4,-1.0,-1.837e-16
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""541518023""",0.030492,2,2018,9,20,4,-1.0,-1.837e-16
1537401600000,"""00007d2de826758b65a93dd24ce629…","""505221004""",0.015237,2,2018,9,20,4,-1.0,-1.837e-16


In [23]:
# Report the full transaction count before filtering.
total_transactions = trans_df.height
logger.info(
    f"There are **{total_transactions:,}** transactions in total made by all the customers."
)

# Keep only the transactions made by the sampled customers (inner join on the id).
sampled_customer_ids = customer_subset_df.select("customer_id")
trans_df = trans_df.join(sampled_customer_ids, on="customer_id")

remaining_transactions = trans_df.height
logger.info(
    f"There are **{remaining_transactions:,}** transactions made by the remaining customers."
)

⛳️ There are 31,788,324 transactions in total made by all the users.
⛳️ The subset based on the remaining users has 122,917 transactions in total.


---

## <span style="color:#ff5f27">🤳🏻 Interaction Data</span>


In [24]:
# Build the interaction dataset from the filtered transactions.
interaction_df = generate_interaction_data(trans_df)

# Log the (rows, columns) shape and preview the first rows.
logger.info(interaction_df.shape)
interaction_df.head()

Processing customer chunks: 100%|██████████| 5/5 [00:50<00:00, 10.04s/it]


✅ Done!
(690014, 5)
CPU times: user 48.6 s, sys: 7.58 s, total: 56.1 s
Wall time: 50.8 s


t_dat,customer_id,article_id,interaction_score,prev_article_id
i64,str,str,i64,str
1544641200000,"""00134024c835e60cb90ce17645df8f…","""823505002""",0,"""START"""
1544655600000,"""00134024c835e60cb90ce17645df8f…","""777756002""",0,"""823505002"""
1544659200000,"""00134024c835e60cb90ce17645df8f…","""777756002""",0,"""777756002"""
1544662800000,"""00134024c835e60cb90ce17645df8f…","""823505002""",0,"""777756002"""
1544673600000,"""00134024c835e60cb90ce17645df8f…","""778187001""",0,"""823505002"""


In [25]:
# Number of rows per interaction score.
interaction_df.group_by("interaction_score").agg(
    total_interactions=pl.col("interaction_score").count()
)

interaction_score,total_interactions
i64,u32
0,370193
1,196904
2,122917


---
## <span style="color:#ff5f27">⚙️ To Pandas </span>

We have to convert all the DataFrames from Polars to Pandas to make them compatible with Hopsworks.


In [26]:
# Convert every Polars frame to Pandas, rebinding the same names.
# Note: the cell cannot be re-run as-is, because after the first run the names
# refer to Pandas DataFrames, which have no `to_pandas` method.
customers_df = customers_df.to_pandas()  # NOTE(review): an Arrow-backed conversion may be an option here — confirm
articles_df = articles_df.to_pandas()
trans_df = trans_df.to_pandas()
interaction_df = interaction_df.to_pandas()

CPU times: user 420 ms, sys: 175 ms, total: 595 ms
Wall time: 630 ms


---

## <span style="color:#ff5f27">🪄 Create Hopsworks Feature Groups </span>

A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features.

Before you can create a feature group you need to connect to your feature store.

To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group.

#### <span style="color:#ff5f27"> Customers </span>

In [27]:
# Get (or create) version 1 of the `customers` feature group, keyed by
# `customer_id` and enabled for online serving. No data is written here.
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    description="Customers data including age and postal code",
    version=1,
    primary_key=["customer_id"],
    online_enabled=True,
)

Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).

At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you populate it with its associated data using the `insert` method.

In [28]:
# Write the customers data and block until the materialization job has finished:
# the ranking dataset further down is read back from this feature group, so the
# data must be fully written first (same option as the interactions insert).
customers_fg.insert(
    customers_df,
    write_options={"wait_for_job": True},
)
logger.info("✅ Done!")

Uploading Dataframe: 0.00% |          | Rows 0/1356119 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: customers_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15551/jobs/named/customers_1_offline_fg_materialization/executions
✅ Done!


In [29]:
# Human-readable documentation for each column of the customers feature group.
feature_descriptions = [
    {"name": column, "description": text}
    for column, text in (
        ("customer_id", "Unique identifier for each customer."),
        ("club_member_status", "Membership status of the customer in the club."),
        ("age", "Age of the customer."),
        ("postal_code", "Postal code associated with the customer's address."),
        ("age_group", "Categorized age group of the customer."),
    )
]

# Attach every description to its feature in Hopsworks.
for entry in feature_descriptions:
    customers_fg.update_feature_description(entry["name"], entry["description"])

#### <span style="color:#ff5f27"> Articles </span>

Let's do the same thing for the rest of the data frames.

In [30]:
from hsfs.feature import Feature

# (name, offline type, description) for every column of the articles feature
# group, in schema order.
article_feature_specs = [
    ("article_id", "string", "Identifier for the article."),
    ("product_code", "bigint", "Code associated with the product."),
    ("prod_name", "string", "Name of the product."),
    ("product_type_no", "bigint", "Number associated with the product type."),
    ("product_type_name", "string", "Name of the product type."),
    ("product_group_name", "string", "Name of the product group."),
    (
        "graphical_appearance_no",
        "bigint",
        "Number associated with graphical appearance.",
    ),
    ("graphical_appearance_name", "string", "Name of the graphical appearance."),
    ("colour_group_code", "bigint", "Code associated with the colour group."),
    ("colour_group_name", "string", "Name of the colour group."),
    (
        "perceived_colour_value_id",
        "bigint",
        "ID associated with perceived colour value.",
    ),
    (
        "perceived_colour_value_name",
        "string",
        "Name of the perceived colour value.",
    ),
    (
        "perceived_colour_master_id",
        "bigint",
        "ID associated with perceived colour master.",
    ),
    (
        "perceived_colour_master_name",
        "string",
        "Name of the perceived colour master.",
    ),
    ("department_no", "bigint", "Number associated with the department."),
    ("department_name", "string", "Name of the department."),
    ("index_code", "string", "Code associated with the index."),
    ("index_name", "string", "Name of the index."),
    ("index_group_no", "bigint", "Number associated with the index group."),
    ("index_group_name", "string", "Name of the index group."),
    ("section_no", "bigint", "Number associated with the section."),
    ("section_name", "string", "Name of the section."),
    ("garment_group_no", "bigint", "Number associated with the garment group."),
    ("garment_group_name", "string", "Name of the garment group."),
    ("prod_name_length", "bigint", "Length of the product name."),
    ("article_description", "string", "Description of the article."),
    (
        "embeddings",
        "array<double>",
        "Vector embeddings of the article description.",
    ),
    ("image_url", "string", "URL of the product image."),
]

# Columns that need extra `Feature` keyword arguments on top of the basic triple.
article_feature_extras = {
    "article_description": {"online_type": "VARCHAR(5800)"},
}

# Explicit schema passed to the articles feature group.
article_features = [
    Feature(
        name=column,
        type=offline_type,
        description=text,
        **article_feature_extras.get(column, {}),
    )
    for column, offline_type, text in article_feature_specs
]

In [31]:
from hsfs import embedding

# Create the Embedding Index
emb = embedding.EmbeddingIndex()

# Register the `embeddings` column in the index, sized to the embedding
# dimension reported by the sentence-transformer model.
emb.add_embedding(
    "embeddings",
    model.get_sentence_embedding_dimension(),
)

In [31]:
# Get (or create) version 1 of the `articles` feature group with the explicit
# schema and the embedding index defined above.
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    version=1,
    description="Fashion items data including type of item, visual description and category",
    primary_key=["article_id"],
    online_enabled=True,
    features=article_features,
    embedding_index=emb,
)

# Write the articles data and block until the materialization job has finished:
# the ranking dataset further down is read back from this feature group, so the
# data must be fully written first (same option as the interactions insert).
articles_fg.insert(
    articles_df,
    write_options={"wait_for_job": True},
)
logger.info("✅ Done!")

Uploading Dataframe: 0.00% |          | Rows 0/105542 | Elapsed Time: 00:00 | Remaining Time: ?

#### <span style="color:#ff5f27"> Transactions </span>

In [48]:
# Get (or create) version 1 of the `transactions` feature group, keyed by
# (customer_id, article_id) with `t_dat` as the event time.
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Transactions data including customer, item, price, sales channel and transaction date",
    primary_key=["customer_id", "article_id"],
    online_enabled=True,
    event_time="t_dat",
)

# Write the transactions data and block until the materialization job has
# finished: the ranking dataset further down is read back from this feature
# group, so the data must be fully written first (same option as the
# interactions insert).
trans_fg.insert(
    trans_df,
    write_options={"wait_for_job": True},
)
logger.info("✅ Done!")

Uploading Dataframe: 0.00% |          | Rows 0/122917 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15551/jobs/named/transactions_1_offline_fg_materialization/executions
✅ Done!


In [49]:
# Human-readable documentation for each column of the transactions feature group.
feature_descriptions = [
    {"name": column, "description": text}
    for column, text in (
        ("t_dat", "Timestamp of the data record."),
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the purchased article."),
        ("price", "Price of the purchased article."),
        ("sales_channel_id", "Identifier for the sales channel."),
        ("year", "Year of the transaction."),
        ("month", "Month of the transaction."),
        ("day", "Day of the transaction."),
        ("day_of_week", "Day of the week of the transaction."),
        ("month_sin", "Sine of the month used for seasonal patterns."),
        ("month_cos", "Cosine of the month used for seasonal patterns."),
    )
]

# Attach every description to its feature in Hopsworks.
for entry in feature_descriptions:
    trans_fg.update_feature_description(entry["name"], entry["description"])

#### <span style="color:#ff5f27"> Interactions </span>

In [50]:
# Create Interactions Feature Group: version 1, keyed by
# (customer_id, article_id) with `t_dat` as the event time.
interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    version=1,
    description="Customer interactions with articles including purchases, clicks, and ignores. Used for building recommendation systems and analyzing user behavior.",
    primary_key=["customer_id", "article_id"],
    online_enabled=True,
    event_time="t_dat",
)

# Insert the data; `wait_for_job` asks the call to return only once the
# write job has finished.
interactions_fg.insert(
    interaction_df,
    write_options={"wait_for_job": True},
)
logger.info("✅ Done!")

Uploading Dataframe: 0.00% |          | Rows 0/687904 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: interactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15551/jobs/named/interactions_1_offline_fg_materialization/executions
✅ Done!


In [51]:
# Define feature descriptions for interactions
feature_descriptions = [
    {"name": "t_dat", "description": "Timestamp of the interaction."},
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {
        "name": "article_id",
        "description": "Identifier for the article that was interacted with.",
    },
    {
        "name": "interaction_score",
        "description": "Type of interaction: 0 = ignore, 1 = click, 2 = purchase.",
    },
    {
        "name": "prev_article_id",
        "description": "Previous article that the customer interacted with, useful for sequential recommendation patterns.",
    },
]

# Update feature descriptions
for desc in feature_descriptions:
    interactions_fg.update_feature_description(desc["name"], desc["description"])

## <span style="color:#ff5f27">📊 Ranking Dataset </span>


In [53]:
# Build the ranking dataset from the transactions, articles and customers
# feature groups (the helper receives the feature group handles, not the
# local DataFrames).
ranking_df = compute_ranking_dataset(
    trans_fg,
    articles_fg,
    customers_fg,
)
ranking_df.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.14s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (49.28s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (18.76s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (75.59s) 


Unnamed: 0,customer_id,age,month_sin,month_cos,article_id,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,3d5e6086e3ee56819118531a4a2cadcdaf5fc84448861b...,26.0,-0.5,-0.8660254,684341002,1,Swimwear bottom,Swimwear,Solid,Dark Red,Dark,Red,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear
1,89ab00da5e571760615e7d9941bf976cb08f4048d44ff4...,31.0,-1.0,-1.83697e-16,684341002,1,Swimwear bottom,Swimwear,Solid,Dark Red,Dark,Red,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear
2,68cc05ec0de8b8b08c76ee83cbe1d5f1286544d711462f...,24.0,-0.5,-0.8660254,684341002,1,Swimwear bottom,Swimwear,Solid,Dark Red,Dark,Red,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear


In [55]:
# Class balance of the ranking dataset: number of rows per `label` value.
ranking_df.label.value_counts()

label
0    1103050
1     110305
Name: count, dtype: int64

In [56]:
# Get (or create) version 1 of the derived `ranking` feature group and record
# the feature groups it was computed from as its parents.
rank_fg = fs.get_or_create_feature_group(
    name="ranking",
    version=1,
    description="Derived feature group for ranking",
    primary_key=["customer_id", "article_id"],
    parents=[articles_fg, customers_fg, trans_fg],
)

# Write the ranking data and block until the materialization job has finished,
# so "Done!" is only logged once the data has actually been written (same
# option as the interactions insert).
rank_fg.insert(
    ranking_df,
    write_options={"wait_for_job": True},
)
logger.info("✅ Done!")

Uploading Dataframe: 0.00% |          | Rows 0/1213355 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15551/jobs/named/ranking_1_offline_fg_materialization/executions
✅ Done!


In [57]:
# Human-readable documentation for each column of the ranking feature group.
feature_descriptions = [
    {"name": column, "description": text}
    for column, text in (
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the purchased article."),
        ("age", "Age of the customer."),
        ("month_sin", "Sine of the month used for seasonal patterns."),
        ("month_cos", "Cosine of the month used for seasonal patterns."),
        ("product_type_name", "Name of the product type."),
        ("product_group_name", "Name of the product group."),
        ("graphical_appearance_name", "Name of the graphical appearance."),
        ("colour_group_name", "Name of the colour group."),
        ("perceived_colour_value_name", "Name of the perceived colour value."),
        ("perceived_colour_master_name", "Name of the perceived colour master."),
        ("department_name", "Name of the department."),
        ("index_name", "Name of the index."),
        ("index_group_name", "Name of the index group."),
        ("section_name", "Name of the section."),
        ("garment_group_name", "Name of the garment group."),
        (
            "label",
            "Label indicating whether the article was purchased (1) or not (0).",
        ),
    )
]

# Attach every description to its feature in Hopsworks.
for entry in feature_descriptions:
    rank_fg.update_feature_description(entry["name"], entry["description"])

You should now be able to inspect the feature groups in the Hopsworks UI.

---

In [58]:
# End the timer
notebook_end_time = time.time()

# Calculate and log the execution time (wall-clock seconds since the first cell ran)
notebook_execution_time = notebook_end_time - notebook_start_time
logger.info(f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds")

⌛️ Notebook Execution time: 1974.10 seconds


---
## <span style="color:#ff5f27">⏩️ Next Steps </span>
In the next notebook you'll train a retrieval model.