In [12]:
import time

notebook_start_time = time.time()

# Set up environment

In [14]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the object returned by IPython's ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    """Clone the project repository and change into the cloned directory.

    Relies on IPython shell (`!`) and line-magic (`%cd`) syntax, so it only
    works when executed inside an IPython/Jupyter kernel.
    """
    !git clone https://github.com/chinadupaya/art-recommendations.git
    # `%cd` changes the kernel's working directory, so later cells run from the cloned repository.
    %cd art-recommendations/


def install_dependencies() -> None:
    """Install the project's dependencies with `uv`.

    First installs/upgrades `uv` through pip, then uses it to install the
    requirements declared in the `pyproject.toml` of the current working
    directory, including all extras.

    NOTE(review): no versions are pinned here, so installs may differ between runs.
    """
    !pip install --upgrade uv
    # `--system` targets the system Python environment instead of a virtual environment.
    !uv pip install --all-extras --system --requirement pyproject.toml


# On Colab the repository is cloned and its dependencies are installed first;
# on a local run neither step is performed.
if is_google_colab():
    clone_repository()
    install_dependencies()

    # `clone_repository()` has already changed into the cloned repository, so the
    # current working directory is used as the project root.
    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")
else:
    # The parent of the current working directory is used as the project root.
    # NOTE(review): assumes the kernel is started from a direct sub-directory of the project root — confirm.
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")

# Add the root directory to `sys.path` (the module search path) so the `recsys` Python module can be imported from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment


# 👩🏻‍🔬 Feature pipeline: Computing features

# Imports

In [41]:
%load_ext autoreload
%autoreload 2
# %reload_ext autoreload

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys import hopsworks_integration
from recsys.config import settings
from recsys.features.artworks import (
    compute_features_artworks,
    generate_embeddings_for_dataframe,
)
from recsys.features.users import DatasetSampler, compute_features_users
from recsys.features.interaction import generate_interaction_data
# from recsys.features.ranking import compute_ranking_dataset
from recsys.features.transactions import compute_features_transactions
# from recsys.hopsworks_integration import feature_store
# from recsys.raw_data_sources import h_and_m as h_and_m_raw_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Constants

The most important one is the dataset size.

Choosing a different dataset size will impact the time it takes to run everything and the quality of the final models. We suggest using a small dataset size when running this the first time.

Supported user dataset sizes:

In [16]:
DatasetSampler.get_supported_sizes()

{<UserDatasetSize.LARGE: 'LARGE'>: 50000,
 <UserDatasetSize.MEDIUM: 'MEDIUM'>: 5000,
 <UserDatasetSize.SMALL: 'SMALL'>: 1000}

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [17]:
project, fs = hopsworks_integration.get_feature_store()

[32m2024-12-22 11:45:56.006[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m15[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


2024-12-22 11:45:56,007 INFO: Initializing external client
2024-12-22 11:45:56,008 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-22 11:45:58,503 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1197208


In [18]:
# Resolve the CSV against `root_dir` (set in the environment-setup cell) instead of a
# path relative to the working directory: locally the notebook runs one level below
# the project root, but on Colab the working directory *is* the project root, so
# "../data" would point outside the repository there.
artworks_df = pl.read_csv(Path(root_dir) / "data" / "artworks_info.csv")
artworks_df.shape

(18538, 8)

The artworks DataFrame looks as follows

In [19]:
artworks_df.head(3)

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
str,str,str,str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a rich pale…"


In [20]:
artworks_df.null_count()

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


## Artworks Feature Engineering

In [21]:
artworks_df = compute_features_artworks(artworks_df)
artworks_df.shape


(18538, 5)

In [22]:
artworks_df.head()

id,title,category,thumbnail_link,description
str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…"
"""4d8b937c4eb68a1b2c001722""","""Mona Lisa""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is characterized by …"
"""4d8b93b04eb68a1b2c001b9d""","""Luncheon on the Grass (Le Déje…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a palette o…"


### Create embeddings from the artwork descriptions

In [23]:
# Log the first three artwork descriptions, numbered from 1.
first_descriptions = artworks_df["description"].head(n=3)
for i, desc in enumerate(first_descriptions):
    logger.info(f"Item {i+1}:\n{desc}")

[32m2024-12-22 11:46:19.902[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
The image is rich in golds and soft pastels, creating a warm, harmonious atmosphere. The mood is intimate and tender, evoking feelings of love and closeness. The aesthetic features intricate patterns and floral motifs, reminiscent of the Art Nouveau style. Overall, it radiates a sense of tranquility and beauty.[0m
[32m2024-12-22 11:46:19.903[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
The image features a somber palette dominated by dark tones and stark contrasts, highlighting the brutality of the scene. The mood conveys a sense of despair and urgency, evoking empathy for the victims. The dramatic lighting, particularly the lantern's glow, creates a tension between hope and tragedy. Overall, the aesthetic is visceral and haunting, capturing the raw emotion of human conflict.[0m
[32m2024-12-22 11:46:19.903[0m | [1mINFO    [0m

In [109]:
# Pick the compute device in order of preference: CUDA, then the MPS backend, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

logger.info(
    f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}"
)

# Instantiate the configured SentenceTransformer model on the selected device.
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2024-12-19 19:44:40.870[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='mps'[0m


2024-12-19 19:44:40,871 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [111]:
embedding = generate_embeddings_for_dataframe(
    artworks_df.head(1), "description", model, batch_size=128
)['embeddings']
embedding

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]


embeddings
list[f64]
"[-0.008282, 0.030209, … -0.003906]"


In [112]:
type(artworks_df)

polars.dataframe.frame.DataFrame

In [113]:
# Illustration only: show what an `embeddings` column looks like by attaching the
# single embedding computed in the previous cell to a three-row preview.
# The one vector is broadcast to every row, so the result is deliberately NOT
# assigned back to `artworks_df` — doing so would give every artwork the first
# artwork's embedding until the full embedding pass is run.
artworks_df.head(3).with_columns(
    pl.lit(embedding[0].to_list()).alias("embeddings")
)

id,title,category,thumbnail_link,description,embeddings
str,str,str,str,str,list[f64]
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…","[-0.008282, 0.030209, … -0.003906]"


In [115]:
artworks_df = generate_embeddings_for_dataframe(
    artworks_df, "description", model, batch_size=128
)  # Reduce batch size if getting OOM errors.

Generating embeddings: 100%|██████████| 18538/18538 [01:41<00:00, 182.40it/s]


For each artwork description, we now have a numerical vector that we can feed to a model, as opposed to a raw string containing the description of the artwork.

In [116]:
artworks_df[["description", "embeddings"]].head(3)

description,embeddings
str,list[f64]
"""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""The image features a somber pa…","[0.058479, 0.083046, … 0.008479]"
"""The image features a rich pale…","[0.012057, 0.037521, … 0.004547]"


## Looking at image links

In [24]:
artworks_df["thumbnail_link"][0]

'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/medium.jpg'

In [118]:
from html import escape

from IPython.display import HTML, display

# Show the thumbnails of the last 12 artworks in a 6-column grid.
image_urls = artworks_df["thumbnail_link"].tail(12).to_list()

# The URLs come straight from the CSV, so escape them before placing them inside an
# HTML attribute; a stray quote or `&` would otherwise corrupt the markup.
image_tags = "".join(
    f'<img src="{escape(str(url), quote=True)}" style="width: 100%; height: auto;">'
    for url in image_urls
)
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))


# Users Data

In [25]:
# Resolve the CSV against `root_dir` (set in the environment-setup cell) so the load
# works both locally and on Colab, where "../data" would point outside the repository.
users_df = pl.read_csv(Path(root_dir) / "data" / "updated_user_details.csv")
users_df.shape


(54000, 5)

The users DataFrame looks as follows:

In [26]:
users_df.head(3)

user_id,literal,age,gender,preference
str,str,i64,str,str
"""69ff63b1-4803-49c9-8585-3b0f47…","""[Abbaye du Gard pres d' Abbevi…",42,"""Male""","""Impressionism"""
"""81cd5866-14fb-4968-80f2-accf84…","""[Italian Landscape, Sunset (So…",48,"""Female""","""Impressionism"""
"""83e00911-6b5a-44d3-9edb-3c02a2…","""[The Church of Souain, The Sei…",56,"""Female""","""Impressionism"""


Check for nulls

In [27]:
users_df.null_count()

user_id,literal,age,gender,preference
u32,u32,u32,u32,u32
0,0,0,0,0


# Users feature engineering

In [30]:
users_df = compute_features_users(users_df, drop_null_age=True)
users_df.shape

(54000, 6)

# Transactions Data
These are the artworks a user likes

In [58]:
# Resolve the CSV against `root_dir` (set in the environment-setup cell) so the load
# works both locally and on Colab, where "../data" would point outside the repository.
transactions_df = pl.read_csv(Path(root_dir) / "data" / "transaction-data.csv")
transactions_df.shape

(306100, 4)

Transactions DataFrame looks as follows

In [62]:
transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link,t_dat
str,str,str,str,str
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…","""2024-12-22"""
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…","""2024-12-22"""
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…","""2024-12-22"""


In [67]:
from datetime import datetime

# Stamp every transaction with the moment this cell is executed; the same literal
# datetime is written to the `t_dat` column of all rows.
t_dat = datetime.today()
transactions_df = transactions_df.with_columns(pl.lit(t_dat).alias("t_dat"))

transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link,t_dat
str,str,str,str,datetime[μs]
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…",2024-12-22 12:47:45.756524
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…",2024-12-22 12:47:45.756524
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…",2024-12-22 12:47:45.756524


# Transactions Feature Engineering

In [68]:
transactions_df = compute_features_transactions(transactions_df)
transactions_df.shape

(306100, 9)

In [69]:
transactions_df.head(3)

transaction_id,user_id,artwork_id,thumbnail_link,t_dat,year,month,day,day_of_week
str,str,str,str,i64,i32,i8,i8,i8
"""f9f34e02-f51d-42ff-ada2-928728…","""636d1fa7-a3c2-4fe6-b278-265d29…","""516cbb4a0f8b7853440003fe""","""https://d32dm0rphc51dk.cloudfr…",1734871665,2024,12,22,7
"""7f8ba386-bfe0-4c0a-b71e-c45c9b…","""8cea0e10-c960-44e9-b404-bc3047…","""516cbfd89ad2d38886000142""","""https://d32dm0rphc51dk.cloudfr…",1734871665,2024,12,22,7
"""a2f1abf6-b58b-4f4a-a0ba-ee2fba…","""5ab9bc35-c1e5-4d99-bddd-499653…","""515b45c0223afa29bd000948""","""https://d32dm0rphc51dk.cloudfr…",1734871665,2024,12,22,7


We don't want to work with all ~300k transactions in this series, as everything would take too long to run. Thus, we create a subset of the original dataset by randomly sampling users from the users dataset and keeping only their transactions.

In [70]:
# Sample users according to the configured dataset size and keep the matching
# transactions. The keyword `transations_df` is spelled exactly as in the original
# call — presumably it mirrors the parameter name of `DatasetSampler.sample`.
sampler = DatasetSampler(size=settings.USER_DATA_SIZE)
dataset_subset = sampler.sample(users_df=users_df, transations_df=transactions_df)

# Replace the full frames with their sampled counterparts.
users_df, transactions_df = dataset_subset["users"], dataset_subset["transactions"]

[32m2024-12-22 12:48:01.195[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m29[0m - [1mSampling 1000 users.[0m
[32m2024-12-22 12:48:01.195[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m32[0m - [1mNumber of transactions for all the users: 306100[0m
[32m2024-12-22 12:48:01.205[0m | [1mINFO    [0m | [36mrecsys.features.users[0m:[36msample[0m:[36m38[0m - [1mNumber of transactions for the 1000 sampled users: 5606[0m


In [71]:
transactions_df.shape

(5606, 9)

Some of the remaining users:

In [72]:
# Log up to ten distinct user IDs that are still present after sampling.
sampled_user_ids = transactions_df["user_id"].unique().head(10)
for user_id in sampled_user_ids:
    logger.info(f"Logging user ID: {user_id}")

[32m2024-12-22 12:48:06.160[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: b9b09efb-6eec-4d0c-b41c-f7a036905d98[0m
[32m2024-12-22 12:48:06.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 45f542e1-a300-4d14-b887-29dae821c598[0m
[32m2024-12-22 12:48:06.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 96692d5b-3b40-428e-a1d2-e2de2df77ba4[0m
[32m2024-12-22 12:48:06.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: db7c1b44-6830-4181-91bc-b5bbe7631195[0m
[32m2024-12-22 12:48:06.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: 0d172e6c-554e-4226-8900-d95f7056d1ae[0m
[32m2024-12-22 12:48:06.162[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging user ID: e98f53a2-c651-4a5d-9cfc-4bc3da164a90[0m
[32m2024-

# 🤳🏻 Interaction data

To train our models, we need more than just the transactions DataFrame. We need positive samples that signal whether a user clicked or liked an item, but we also need negative samples that signal no interactions between a user and an artwork.

In [73]:
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

Processing user chunks: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


(93801, 5)

The interaction features look as follows:

In [74]:
interaction_df.head()

t_dat,user_id,artwork_id,interaction_score,prev_artwork_id
i64,str,str,i64,str
1382071665,"""00306dfa-3c17-4709-b9f4-834502…","""515d34815eeb1c904c00385b""",0,"""START"""
1385671665,"""00306dfa-3c17-4709-b9f4-834502…","""515d0d29b5907bf7e80024d4""",0,"""515d34815eeb1c904c00385b"""
1385671665,"""00306dfa-3c17-4709-b9f4-834502…","""515bacbe94714c2e3800107c""",0,"""515d0d29b5907bf7e80024d4"""
1428871665,"""00306dfa-3c17-4709-b9f4-834502…","""515b3fe8223afaab8f000eed""",0,"""515bacbe94714c2e3800107c"""
1436071665,"""00306dfa-3c17-4709-b9f4-834502…","""515d24be7696593fde00320c""",0,"""515b3fe8223afaab8f000eed"""


Let's take a look at the interaction score distribution

In [75]:
# Count how many interactions fall under each score. `group_by` does not guarantee
# the order of the resulting rows, so sort by score to keep the table stable across
# runs and aligned with the 0 / 1 / 2 legend below.
interaction_df.group_by("interaction_score").agg(
    pl.count("interaction_score").alias("total_interactions")
).sort("interaction_score")

interaction_score,total_interactions
i64,u32
0,74560
1,13635
2,5606


Here is what each score means:
- `0` : No interaction between a user and an artwork
- `1` : A user clicked an artwork
- `2` : A user liked an artwork