In [3]:
import time

notebook_start_time = time.time()

# Set up environment

In [4]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether this notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active shell object returned by ``get_ipython()``.

    Returns:
        ``True`` when the shell's string form mentions ``google.colab``,
        ``False`` otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    """Clone the project repository and make the checkout the working directory.

    Uses IPython shell (`!`) and line-magic (`%`) syntax, so it can only run
    inside an IPython/Jupyter kernel.
    """
    # Clones into `art-recommendations/` relative to the current working directory.
    !git clone https://github.com/chinadupaya/art-recommendations.git
    # `%cd` changes the kernel's working directory, so later relative paths
    # are resolved from inside the checkout.
    %cd art-recommendations/


def install_dependencies() -> None:
    """Install `uv` with pip, then use it to install the project requirements.

    `pyproject.toml` is given as a relative path, so this must be called while
    the working directory is the repository root (i.e. after
    `clone_repository()`).
    """
    !pip install --upgrade uv
    # `--system` installs into the system Python rather than a virtual
    # environment; `--all-extras` includes every optional dependency group.
    !uv pip install --all-extras --system --requirement pyproject.toml


# Detect the runtime once. On Colab the repository must be cloned and its
# dependencies installed before the project code can be imported.
running_in_colab = is_google_colab()
if running_in_colab:
    clone_repository()
    install_dependencies()

# On Colab the working directory is already the repository root (we `cd` into
# the clone); locally the root is the parent of the working directory.
working_dir = Path().absolute()
root_dir = str(working_dir if running_in_colab else working_dir.parent)
print("⛳️ Google Colab environment" if running_in_colab else "⛳️ Local environment")

# Put the root directory on the `PYTHONPATH` so the `recsys` Python module can
# be imported from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment


# 👩🏻‍🔬 Feature pipeline: Computing features

# Imports

In [5]:
%load_ext autoreload
%autoreload 2
# %reload_ext autoreload

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys import hopsworks_integration
from recsys.config import settings
from recsys.features.artworks import (
    compute_features_artworks,
    generate_embeddings_for_dataframe,
)
from recsys.features.users import DatasetSampler, compute_features_users
from recsys.features.interaction import generate_interaction_data
from recsys.features.ranking import compute_ranking_dataset
from recsys.features.ranking3 import compute_ranking_dataset2
from recsys.features.transactions import compute_features_transactions
from recsys.hopsworks_integration import feature_store

  from .autonotebook import tqdm as notebook_tqdm


# Constants

The most important one is the dataset size.

Choosing a different dataset size will impact the time it takes to run everything and the quality of the final models. We suggest using a small dataset size when running this the first time.

Supported user dataset sizes:

In [6]:
DatasetSampler.get_supported_sizes()

{<UserDatasetSize.LARGE: 'LARGE'>: 50000,
 <UserDatasetSize.MEDIUM: 'MEDIUM'>: 5000,
 <UserDatasetSize.SMALL: 'SMALL'>: 1000}

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [7]:
project, fs = hopsworks_integration.get_feature_store()

[32m2024-12-29 21:55:31.287[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m15[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


2024-12-29 21:55:31,287 INFO: Initializing external client
2024-12-29 21:55:31,288 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-29 21:55:32,506 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1197208


In [8]:
# Resolve the file from the repository root instead of the kernel's working
# directory: locally the kernel starts one level below the root, but on Colab
# the working directory *is* the root, where "../data/..." would point outside
# the repository.
artworks_df = pl.read_csv(Path(root_dir) / "data" / "artworks_info.csv")
artworks_df.shape

(18538, 8)

The artworks DataFrame looks as follows

In [9]:
artworks_df.head(3)

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
str,str,str,str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a rich pale…"


In [10]:
artworks_df.null_count()

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


## Artworks Feature Engineering

In [11]:
artworks_df = compute_features_artworks(artworks_df)
artworks_df.shape


(18538, 5)

In [12]:
artworks_df.head()

artwork_id,title,category,thumbnail_link,description
str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…"
"""4d8b937c4eb68a1b2c001722""","""Mona Lisa""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is characterized by …"
"""4d8b93b04eb68a1b2c001b9d""","""Luncheon on the Grass (Le Déje…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a palette o…"


### Create embeddings from the artwork descriptions

In [13]:
# Log the first three artwork descriptions, numbered from 1, to see what the
# embedding model will receive as input.
preview_descriptions = artworks_df["description"].head(n=3)
for position, description in enumerate(preview_descriptions, start=1):
    logger.info(f"Item {position}:\n{description}")

[32m2024-12-29 21:55:33.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
The image is rich in golds and soft pastels, creating a warm, harmonious atmosphere. The mood is intimate and tender, evoking feelings of love and closeness. The aesthetic features intricate patterns and floral motifs, reminiscent of the Art Nouveau style. Overall, it radiates a sense of tranquility and beauty.[0m
[32m2024-12-29 21:55:33.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
The image features a somber palette dominated by dark tones and stark contrasts, highlighting the brutality of the scene. The mood conveys a sense of despair and urgency, evoking empathy for the victims. The dramatic lighting, particularly the lantern's glow, creates a tension between hope and tragedy. Overall, the aesthetic is visceral and haunting, capturing the raw emotion of human conflict.[0m
[32m2024-12-29 21:55:33.926[0m | [1mINFO    [0m

In [14]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# and finally plain CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

logger.info(
    f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}"
)

# Fetch the embedding model by its ID from SentenceTransformer's model registry
# and place it on the selected device.
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2024-12-29 21:55:33.968[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='mps'[0m


2024-12-29 21:55:33,968 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [15]:
# Smoke-test the embedding helper on a single artwork before running it on the
# whole frame, and keep only the resulting "embeddings" column.
single_artwork_df = generate_embeddings_for_dataframe(
    artworks_df.head(1), "description", model, batch_size=128
)
embedding = single_artwork_df["embeddings"]
embedding

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  8.57it/s]


embeddings
list[f64]
"[-0.008282, 0.030209, … -0.003906]"


In [16]:
type(artworks_df)

polars.dataframe.frame.DataFrame

In [17]:
# Schema preview only: broadcast the single test embedding onto the first three
# rows WITHOUT reassigning `artworks_df`. Overwriting the full frame here would
# give every artwork the first artwork's vector, and those wrong vectors would
# silently reach the feature store if the real embedding cell below were
# skipped or failed (e.g. on an out-of-memory error).
artworks_df.head(3).with_columns(
    pl.lit(embedding[0].to_list()).alias("embeddings")
)

artwork_id,title,category,thumbnail_link,description,embeddings
str,str,str,str,str,list[f64]
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…","[-0.008282, 0.030209, … -0.003906]"


In [18]:
artworks_df = generate_embeddings_for_dataframe(
    artworks_df, "description", model, batch_size=128
)  # Reduce batch size if getting OOM errors.

Generating embeddings: 100%|██████████| 18538/18538 [01:30<00:00, 205.84it/s]


For each artwork description, we now have a numerical vector that we can feed to a model, as opposed to a raw string containing the description.

In [19]:
artworks_df[["description", "embeddings"]].head(3)

description,embeddings
str,list[f64]
"""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""The image features a somber pa…","[0.058479, 0.083046, … 0.008479]"
"""The image features a rich pale…","[0.012057, 0.037521, … 0.004547]"


## Looking at image links

In [20]:
artworks_df["thumbnail_link"][0]

'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/medium.jpg'

In [21]:
from IPython.display import HTML, display

# Render the last 12 thumbnails as a 6-column CSS grid directly in the notebook.
image_urls = artworks_df["thumbnail_link"].tail(12).to_list()
image_tags = "".join(
    f'<img src="{url}" style="width: 100%; height: auto;">' for url in image_urls
)
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))


# Users Data

In [22]:
# Resolve the file from the repository root instead of the kernel's working
# directory, so the same cell works locally (kernel one level below the root)
# and on Colab (working directory is the root).
users_df = pl.read_csv(Path(root_dir) / "data" / "updated_user_details.csv")
users_df.shape


(54000, 5)

The users DataFrame looks as follows:

In [23]:
users_df.head(3)

user_id,literal,age,gender,preference
str,str,i64,str,str
"""69ff63b1-4803-49c9-8585-3b0f47…","""[Abbaye du Gard pres d' Abbevi…",42,"""Male""","""Impressionism"""
"""81cd5866-14fb-4968-80f2-accf84…","""[Italian Landscape, Sunset (So…",48,"""Female""","""Impressionism"""
"""83e00911-6b5a-44d3-9edb-3c02a2…","""[The Church of Souain, The Sei…",56,"""Female""","""Impressionism"""


Check for nulls

In [24]:
users_df.null_count()

user_id,literal,age,gender,preference
u32,u32,u32,u32,u32
0,0,0,0,0


# Users feature engineering

In [25]:
users_df = compute_features_users(users_df, drop_null_age=True)
users_df.shape

(54000, 6)

# Transactions Data
These are the artworks a user likes

In [26]:
# Resolve the file from the repository root instead of the kernel's working
# directory, so the same cell works locally (kernel one level below the root)
# and on Colab (working directory is the root).
transactions_df = pl.read_csv(Path(root_dir) / "data" / "transaction-data.csv")
transactions_df.shape

(306100, 4)

Transactions DataFrame looks as follows

In [27]:
transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link
str,str,str,str
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…"
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…"
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…"


In [28]:
from datetime import datetime

# The raw transactions have no timestamp column, so every row is stamped with
# the moment this cell runs (a naive datetime from `datetime.today()`).
t_dat = datetime.today()
transactions_df = transactions_df.with_columns(pl.lit(t_dat).alias("t_dat"))

transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link,t_dat
str,str,str,str,datetime[μs]
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…",2024-12-29 21:57:07.340762
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…",2024-12-29 21:57:07.340762
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…",2024-12-29 21:57:07.340762


## Remove transactions that aren't in the categories of filtered artworks

In [29]:
transactions_df = transactions_df.join(artworks_df, on="artwork_id", how="semi")
transactions_df.shape

(275417, 5)

# Transactions Feature Engineering

In [30]:
transactions_df = compute_features_transactions(transactions_df)
transactions_df.shape

(275417, 9)

In [31]:
transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link,t_dat,year,month,day,day_of_week
str,str,str,str,i64,i32,i8,i8,i8
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…",1735509427,2024,12,29,7
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…",1735509427,2024,12,29,7
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…",1735509427,2024,12,29,7


We don't want to work with ~300k transactions in this series, as everything would take too long to run. Thus, we create a subset of the original dataset by randomly sampling users and keeping only their transactions.

In [32]:
# Down-sample to the configured number of users and keep only the transactions
# that belong to them. Both frames are replaced by their sampled versions.
# NOTE: the keyword below really is spelled `transations_df` in this call.
sampler = DatasetSampler(size=settings.USER_DATA_SIZE)
dataset_subset = sampler.sample(users_df=users_df, transations_df=transactions_df)
users_df, transactions_df = dataset_subset["users"], dataset_subset["transactions"]

[32m2024-12-29 21:57:07.519[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m29[0m - [1mSampling 1000 users.[0m
[32m2024-12-29 21:57:07.520[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m32[0m - [1mNumber of transactions for all the users: 275417[0m
[32m2024-12-29 21:57:07.526[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m38[0m - [1mNumber of transactions for the 1000 sampled users: 5034[0m


In [33]:
transactions_df.shape

(5034, 9)

Some of the remaining users

In [34]:
# Log up to ten distinct user IDs that survived the sampling step.
sample_user_ids = transactions_df["user_id"].unique().head(10)
for user_id in sample_user_ids:
    logger.info(f"Logging user ID: {user_id}")

[32m2024-12-29 21:57:07.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 0e0cb36e-8d41-44ce-a0bf-1b872e602ac9[0m
[32m2024-12-29 21:57:07.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 1f8b4af1-6665-4f31-8194-bbff2b6a9cb2[0m
[32m2024-12-29 21:57:07.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 099c9447-7381-479c-a45a-e8e177075647[0m
[32m2024-12-29 21:57:07.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: e09da647-c6c0-4a6c-a5b2-60b2e0bbd52f[0m
[32m2024-12-29 21:57:07.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 40162922-8440-4d0a-9f3f-559474990c27[0m
[32m2024-12-29 21:57:07.587[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 13048e58-6c65-4d3b-a4c0-b75ed67724a4[0m
[32m2024-

# 🤳🏻 Interaction data

To train our models, we need more than just the transactions DataFrame. We need positive samples that signal whether a user clicked or liked an item, but we also need negative samples that signal no interactions between a user and an artwork.

In [35]:
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

Processing user chunks: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


(92477, 5)

The interaction features look as follows:

In [36]:
interaction_df.head()

t_dat,user_id,artwork_id,interaction_score,prev_artwork_id
i64,str,str,i64,str
1379109427,"""00306dfa-3c17-4709-b9f4-834502…","""516df4bcb31e2bd65e000b4d""",0,"""START"""
1386309427,"""00306dfa-3c17-4709-b9f4-834502…","""515b4b7138ad2d25a7001489""",0,"""516df4bcb31e2bd65e000b4d"""
1397109427,"""00306dfa-3c17-4709-b9f4-834502…","""515b4b7138ad2d25a7001489""",0,"""515b4b7138ad2d25a7001489"""
1407909427,"""00306dfa-3c17-4709-b9f4-834502…","""515d6be07696593fde004b65""",0,"""515b4b7138ad2d25a7001489"""
1411509427,"""00306dfa-3c17-4709-b9f4-834502…","""515bb7c01b12b0244a002029""",0,"""515d6be07696593fde004b65"""


Let's take a look at the interaction score distribution

In [None]:
# Count how many rows carry each interaction score. `group_by` does not
# guarantee the order of the returned groups, so sort by score to get the same
# easy-to-read table on every run.
(
    interaction_df.group_by("interaction_score")
    .agg(pl.count("interaction_score").alias("total_interactions"))
    .sort("interaction_score")
)

Here is what each score means:
- `0` : No interaction between a user and an artwork
- `1` : A user clicked an artwork
- `2` : A user liked an artwork

# <span style="color:#ff5f27">🪄 Create Hopsworks Feature Groups </span>

A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features.

To create a feature group you need to give it a name and specify a primary key. It is also best practice to provide a description of the contents of the feature group.

#### Users

We set `online_enabled=True` to enable low-latency access to the data from the inference pipeline for real-time predictions. 

A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).

In [38]:
logger.info("Uploading 'users' Feature Group to Hopsworks.")
# Create the 'users' feature group from the sampled users frame.
# `online_enabled=True` requests online (low-latency) access. The returned
# handle is kept because the ranking cells further down take it as input.
users_fg = feature_store.create_users_feature_group(
    fs, df=users_df, online_enabled=True
)

logger.info("✅ Uploaded 'users' Feature Group to Hopsworks!")

[32m2024-12-29 22:02:35.186[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'users' Feature Group to Hopsworks.[0m


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1197208/fs/1186887/fg/1393360


Uploading Dataframe: 100.00% |██████████| Rows 1000/1000 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: users_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1197208/jobs/named/users_1_offline_fg_materialization/executions
2024-12-29 22:02:55,671 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-29 22:02:58,824 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-29 22:04:20,890 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-29 22:04:21,030 INFO: Waiting for log aggregation to finish.
2024-12-29 22:04:39,452 INFO: Execution finished successfully.


[32m2024-12-29 22:04:42.308[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m✅ Uploaded 'users' Feature Group to Hopsworks![0m


#### Artworks

Let's do the same thing for the rest of the data frames

In [39]:
logger.info("Uploading 'artworks' Feature Group to Hopsworks.")
# Create the 'artworks' feature group from the frame that now includes the
# "embeddings" column. The embedding dimension is read from the loaded model
# rather than hard-coded; presumably the helper uses it to size the embedding
# feature -- confirm in `create_artworks_feature_group`.
artworks_fg = feature_store.create_artworks_feature_group(
    fs,
    df=artworks_df,
    artworks_description_embedding_dim=model.get_sentence_embedding_dimension(),
    online_enabled=True,
)
logger.info("✅ Uploaded 'artworks' Feature Group to Hopsworks!")


[32m2024-12-29 22:05:03.973[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'artworks' Feature Group to Hopsworks.[0m


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1197208/fs/1186887/fg/1394346


Uploading Dataframe: 100.00% |██████████| Rows 18538/18538 | Elapsed Time: 01:00 | Remaining Time: 00:00


Launching job: artworks_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1197208/jobs/named/artworks_1_offline_fg_materialization/executions
2024-12-29 22:06:20,734 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-29 22:06:23,903 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-29 22:08:08,494 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-29 22:08:08,632 INFO: Waiting for log aggregation to finish.
2024-12-29 22:08:17,138 INFO: Execution finished successfully.


[32m2024-12-29 22:08:17.140[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m✅ Uploaded 'artworks' Feature Group to Hopsworks![0m


#### Transactions

In [40]:
logger.info("Uploading 'transactions' Feature Group to Hopsworks.")
# Create the 'transactions' feature group from the sampled transactions frame,
# with online access enabled. The handle is reused by the ranking cells below.
trans_fg = feature_store.create_transactions_feature_group(
    fs=fs, df=transactions_df, online_enabled=True
)
logger.info("✅ Uploaded 'transactions' Feature Group to Hopsworks!")

[32m2024-12-29 22:08:22.410[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'transactions' Feature Group to Hopsworks.[0m


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1197208/fs/1186887/fg/1393361


Uploading Dataframe: 100.00% |██████████| Rows 5034/5034 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1197208/jobs/named/transactions_1_offline_fg_materialization/executions
2024-12-29 22:08:47,268 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-29 22:08:50,432 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-29 22:10:18,831 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2024-12-29 22:10:21,990 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-29 22:10:22,144 INFO: Waiting for log aggregation to finish.
2024-12-29 22:10:30,858 INFO: Execution finished successfully.


[32m2024-12-29 22:10:35.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m✅ Uploaded 'transactions' Feature Group to Hopsworks![0m


#### Interactions

In [41]:
logger.info("Uploading 'interactions' Feature Group to Hopsworks.")
# Create the 'interactions' feature group from the generated interaction data,
# with online access enabled.
interactions_fg = feature_store.create_interactions_feature_group(
    fs=fs, df=interaction_df, online_enabled=True
)
logger.info("✅ Uploaded 'interactions' Feature Group to Hopsworks!!")

[32m2024-12-29 22:10:41.323[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'interactions' Feature Group to Hopsworks.[0m


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1197208/fs/1186887/fg/1394347


Uploading Dataframe: 100.00% |██████████| Rows 92477/92477 | Elapsed Time: 00:16 | Remaining Time: 00:00


Launching job: interactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1197208/jobs/named/interactions_1_offline_fg_materialization/executions
2024-12-29 22:11:13,927 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-29 22:11:17,078 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-29 22:13:01,177 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-29 22:13:01,318 INFO: Waiting for log aggregation to finish.
2024-12-29 22:13:09,842 INFO: Execution finished successfully.


[32m2024-12-29 22:13:13.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m✅ Uploaded 'interactions' Feature Group to Hopsworks!![0m


In [None]:
# import hopsworks 
# project = hopsworks.login(api_key_value="H")
# fs = project.get_feature_store(name='id2223artsy_featurestore')
# artworks_fg = fs.get_feature_group('artworks', version=1)

# Compute ranking dataset

The last step is to compute the ranking dataset used to train the scoring/ranking model from the feature groups we've just created:


In [42]:
ranking_df = compute_ranking_dataset(
    trans_fg,
    artworks_fg,
    users_fg,
)
ranking_df.shape

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.04s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.97s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.52s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (22.93s) 


(55352, 7)

The ranking dataset looks as follows:

In [None]:
# ranking_df = compute_ranking_dataset2(
#     transactions_df,
#     artworks_df,
#     users_df,
# )
# ranking_df.shape

In [43]:
ranking_df.head(3)

user_id,age,artwork_id,label,title,description,category
str,f64,str,i32,str,str,str
"""a623decb-2498-478b-bb8d-e55d75…",55.0,"""515b146338ad2d78ca00081c""",1,"""Afternoon Tea Party""","""The image features a palette o…","""Print"""
"""e69837e2-b329-4fbb-88d6-02074d…",53.0,"""515cdd2a5eeb1c904c000d57""",1,"""La Chapelle Saint-Michel à l'E…","""The image features a muted col…","""Print"""
"""ac49aaea-ef6b-48c9-b777-bf8174…",32.0,"""5034f0e30726aa00020008ac""",1,"""Joseph of Arimathea Preaching …","""The image features a palette o…","""Print"""


In [44]:
ranking_df.get_column("label").value_counts()

label,count
i32,u32
0,50320
1,5032


In [45]:
logger.info("Uploading 'ranking' Feature Group to Hopsworks.")
# Create the 'ranking' feature group from the computed ranking dataset.
# Unlike the other feature groups in this notebook it is created with
# `online_enabled=False`. `parents` lists the three feature groups the ranking
# data was computed from -- presumably recorded as provenance; confirm in
# `create_ranking_feature_group`.
rank_fg = feature_store.create_ranking_feature_group(
    fs,
    df=ranking_df,
    parents=[artworks_fg, users_fg, trans_fg],
    online_enabled=False
)
logger.info("✅ Uploaded 'ranking' Feature Group to Hopsworks!!")

[32m2024-12-29 22:14:26.288[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'ranking' Feature Group to Hopsworks.[0m


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1197208/fs/1186887/fg/1393362


Uploading Dataframe: 100.00% |██████████| Rows 55352/55352 | Elapsed Time: 00:23 | Remaining Time: 00:00


Launching job: ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1197208/jobs/named/ranking_1_offline_fg_materialization/executions
2024-12-29 22:15:05,953 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-29 22:15:09,108 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-29 22:16:50,051 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2024-12-29 22:16:53,205 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-29 22:16:53,375 INFO: Waiting for log aggregation to finish.
2024-12-29 22:17:11,791 INFO: Execution finished successfully.


[32m2024-12-29 22:17:15.855[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m✅ Uploaded 'ranking' Feature Group to Hopsworks!![0m
