In [33]:
import time

# Record the wall-clock time at which the notebook run started.
notebook_start_time = time.time()

# Set up environment

In [35]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the shell object returned by ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    """Clone the project repository and switch the working directory into it.

    Relies on IPython shell (`!`) and line (`%`) magics, so it can only be
    called from inside a notebook kernel.
    """
    # Clone into the current working directory.
    !git clone https://github.com/chinadupaya/art-recommendations.git
    # Move the kernel's working directory into the fresh checkout.
    %cd art-recommendations/


def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


# Work out the repository root for the runtime we are in.
if not is_google_colab():
    # Local run: the root is taken to be the parent of the working directory.
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")
else:
    # Colab run: fetch the code and its dependencies first; `clone_repository`
    # leaves the working directory at the checkout, which is used as the root.
    clone_repository()
    install_dependencies()

    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")

# Make the `recsys` package importable from the notebook by putting the root
# directory on the module search path (only once).
root_already_importable = root_dir in sys.path
if not root_already_importable:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment


# 👩🏻‍🔬 Feature pipeline: Computing features

# Imports

In [110]:
%load_ext autoreload
%autoreload 2
# %reload_ext autoreload

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys import hopsworks_integration
from recsys.config import settings
from recsys.features.artworks import (
    compute_features_artworks,
    generate_embeddings_for_dataframe,
)
from recsys.features.users import DatasetSampler
# from recsys.features.interaction import generate_interaction_data
# from recsys.features.ranking import compute_ranking_dataset
# from recsys.features.transactions import compute_features_transactions
# from recsys.hopsworks_integration import feature_store
# from recsys.raw_data_sources import h_and_m as h_and_m_raw_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Constants

The most important one is the dataset size.

Choosing a different dataset size will impact the time it takes to run everything and the quality of the final models. We suggest using a small dataset size when running this the first time.

Supported user dataset sizes:

In [37]:
# Display the supported dataset sizes; each size label maps to an integer
# (presumably a number of users — confirm in `recsys.features.users`).
DatasetSampler.get_supported_sizes()

{<UserDatasetSize.LARGE: 'LARGE'>: 50000,
 <UserDatasetSize.MEDIUM: 'MEDIUM'>: 5000,
 <UserDatasetSize.SMALL: 'SMALL'>: 1000}

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [39]:
# Connect to Hopsworks and keep handles to the project and its feature store.
project, fs = hopsworks_integration.get_feature_store()

[32m2024-12-19 18:50:39.458[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m16[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


2024-12-19 18:50:39,459 INFO: Initializing external client
2024-12-19 18:50:39,460 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-19 18:50:41,025 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1197208


In [102]:
# Build the CSV path from the repository root computed in the environment setup
# cell. A hard-coded "../data/..." only resolves when the working directory sits
# one level below the root (local runs); on Colab the working directory is the
# root itself, so the relative path would point outside the repository.
artworks_df = pl.read_csv(Path(root_dir) / "data" / "artworks_info.csv")
artworks_df.shape

(18538, 8)

The artworks DataFrame looks as follows

In [103]:
# Peek at the first three rows of the raw artworks data.
artworks_df.head(3)

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
str,str,str,str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a rich pale…"


In [104]:
# Same preview using the default number of rows.
artworks_df.head()

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
str,str,str,str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a rich pale…"
"""4d8b937c4eb68a1b2c001722""","""Mona Lisa""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image is characterized by …"
"""4d8b93b04eb68a1b2c001b9d""","""Luncheon on the Grass (Le Déje…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""https://api.artsy.net/api/arti…","""https://api.artsy.net/api/gene…","""https://api.artsy.net/api/artw…","""The image features a palette o…"


In [105]:
# Count missing values per column.
artworks_df.null_count()

id,title,category,thumbnail_link,artists_link,genes_link,similar_link,description
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


## Artworks Feature Engineering

In [106]:
# Run the artwork feature computation and rebind `artworks_df` to the result.
# NOTE(review): this cell reads and overwrites the same name, so re-running it on
# its own feeds the already-processed frame back in — confirm that is safe, or
# re-run from the cell that loads the CSV.
artworks_df = compute_features_artworks(artworks_df)
artworks_df.shape


(18538, 5)

In [107]:
# Inspect the frame after feature computation.
artworks_df.head()

id,title,category,thumbnail_link,description
str,str,str,str,str
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…"
"""4d8b937c4eb68a1b2c001722""","""Mona Lisa""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is characterized by …"
"""4d8b93b04eb68a1b2c001b9d""","""Luncheon on the Grass (Le Déje…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a palette o…"


### Create embeddings from the artwork descriptions

In [108]:
# Log the first three artwork descriptions, numbered from 1.
for item_number, description in enumerate(
    artworks_df["description"].head(n=3), start=1
):
    logger.info(f"Item {item_number}:\n{description}")

[32m2024-12-19 19:44:11.410[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
The image is rich in golds and soft pastels, creating a warm, harmonious atmosphere. The mood is intimate and tender, evoking feelings of love and closeness. The aesthetic features intricate patterns and floral motifs, reminiscent of the Art Nouveau style. Overall, it radiates a sense of tranquility and beauty.[0m
[32m2024-12-19 19:44:11.411[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
The image features a somber palette dominated by dark tones and stark contrasts, highlighting the brutality of the scene. The mood conveys a sense of despair and urgency, evoking empathy for the victims. The dramatic lighting, particularly the lantern's glow, creates a tension between hope and tragedy. Overall, the aesthetic is visceral and haunting, capturing the raw emotion of human conflict.[0m
[32m2024-12-19 19:44:11.412[0m | [1mINFO    [0m

In [109]:
# Pick where the model will run: CUDA if available, otherwise Apple's MPS
# backend if available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# Read the configured embedding model id once and announce the load target.
embedding_model_id = settings.FEATURES_EMBEDDING_MODEL_ID
logger.info(f"Loading '{embedding_model_id}' embedding model to {device=}")

# Instantiate the SentenceTransformer model on the selected device.
model = SentenceTransformer(
    embedding_model_id,
    device=device,
)

[32m2024-12-19 19:44:40.870[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='mps'[0m


2024-12-19 19:44:40,871 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [111]:
# Try the embedding helper on a single row before committing to the full run.
first_row_embedded_df = generate_embeddings_for_dataframe(
    artworks_df.head(1), "description", model, batch_size=128
)
# Keep only the resulting "embeddings" column for inspection.
embedding = first_row_embedded_df["embeddings"]
embedding

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]


embeddings
list[f64]
"[-0.008282, 0.030209, … -0.003906]"


In [112]:
# Sanity check: confirm which DataFrame type `artworks_df` currently is.
type(artworks_df)

polars.dataframe.frame.DataFrame

In [113]:
# Attach the single-row embedding computed above to a small preview frame, to
# check that a list-typed "embeddings" column can be added.
# The result deliberately lives in its own variable: `pl.lit` gives every row the
# SAME vector (the first artwork's), so writing it back to `artworks_df` would
# leave the main frame holding wrong embeddings for all other artworks.
embeddings_preview_df = artworks_df.head(3).with_columns(
    pl.lit(embedding[0].to_list()).alias("embeddings")
)
embeddings_preview_df

id,title,category,thumbnail_link,description,embeddings
str,str,str,str,str,list[f64]
"""4d8b92eb4eb68a1b2c000968""","""Der Kuss (The Kiss)""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b92ee4eb68a1b2c0009ab""","""The Third of May""","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a somber pa…","[-0.008282, 0.030209, … -0.003906]"
"""4d8b93394eb68a1b2c0010fa""","""The Company of Frans Banning C…","""Painting""","""https://d32dm0rphc51dk.cloudfr…","""The image features a rich pale…","[-0.008282, 0.030209, … -0.003906]"


In [115]:
# Compute an embedding for every artwork description and rebind `artworks_df`
# to the frame returned by the helper.
artworks_df = generate_embeddings_for_dataframe(
    artworks_df, "description", model, batch_size=128
)  # Reduce batch size if getting OOM errors.

Generating embeddings: 100%|██████████| 18538/18538 [01:41<00:00, 182.40it/s]


Each artwork description is now represented by a numerical vector that can be fed directly to a model, as opposed to the raw description string.

In [116]:
# Show each description next to its embedding vector.
artworks_df[["description", "embeddings"]].head(3)

description,embeddings
str,list[f64]
"""The image is rich in golds and…","[-0.008282, 0.030209, … -0.003906]"
"""The image features a somber pa…","[0.058479, 0.083046, … 0.008479]"
"""The image features a rich pale…","[0.012057, 0.037521, … 0.004547]"


## Looking at image links

In [117]:
# Example thumbnail URL: the first artwork's link.
artworks_df["thumbnail_link"][0]

'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/medium.jpg'

In [118]:
from html import escape

from IPython.display import HTML, display

# Render the last 12 thumbnails as a 6-column image grid.
image_urls = artworks_df["thumbnail_link"].tail(12).to_list()

# The URLs come from the dataset, so escape each one before placing it inside the
# `src` attribute; a stray quote or angle bracket could otherwise break out of
# the attribute and inject markup into the rendered output.
# `str(url)` keeps the original rendering for any non-string value.
image_tags = "".join(
    f'<img src="{escape(str(url), quote=True)}" style="width: 100%; height: auto;">'
    for url in image_urls
)

# Assemble the grid container in one go instead of growing a string in a loop.
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))
