<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Setup

In [1]:
!pip install hopsworks

Collecting hopsworks
  Using cached hopsworks-3.4.4-py3-none-any.whl
Collecting hsfs<3.5.0,>=3.4.0 (from hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached hsfs-3.4.7-py3-none-any.whl
Collecting hsml<3.5.0,>=3.4.0 (from hopsworks)
  Using cached hsml-3.4.6-py3-none-any.whl
Collecting pyhumps==1.6.1 (from hopsworks)
  Using cached pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks)
  Using cached furl-2.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyjks (from hopsworks)
  Using cached pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting avro==1.11.0 (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached avro-1.11.0-py2.py3-none-any.whl
Collecting PyMySQL[rsa] (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached PyMySQL-1.1.0-py3-none-any.whl.metadata (4.4 kB)
Collecting great-expectations==0.14.13 (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached great_ex

In [None]:
!pip install catboost==1.1.1

<div style="background-color:teal; color:white; padding:5px; font-size:20px">
Imports

In [2]:
# Libraries used for data handling, model training, evaluation metrics and
# model serialisation in the cells below.
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, precision_recall_fscore_support
import joblib
# NOTE(review): repeats the catboost import above; redundant but harmless.
from catboost import CatBoostClassifier, Pool

# NOTE(review): importlib is not referenced anywhere in the visible notebook.
import importlib
import sys
# Make modules located under ./scripts importable from this notebook.
sys.path.append('./scripts')

In [3]:
import boto3

# Name of the Systems Manager parameter that stores the Hopsworks API key.
parameter_name = 'hopsworks-api-key'

# Systems Manager (SSM) client for the us-east-1 region.
ssm = boto3.client('ssm', region_name='us-east-1')

# Request the parameter with decryption enabled so the returned value is
# the plain-text secret.
response = ssm.get_parameter(
    Name=parameter_name,
    WithDecryption=True,
)

# The secret itself sits under Parameter -> Value in the response payload.
hopsworks_api_key = response['Parameter']['Value']

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Hopsworks Feature Store Connection

In [5]:
import hopsworks

# Authenticate explicitly with the API key fetched from AWS Systems Manager
# in the earlier cell. Calling login() without arguments ignores that key and
# relies on whatever credentials happen to be available in the environment,
# which makes a fresh "Restart & Run All" non-reproducible.
project = hopsworks.login(api_key_value=hopsworks_api_key)

# Handle to the project's feature store; used by every data cell below.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/475285
Connected. Call `.close()` to terminate connection gracefully.


<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Data Import

In [6]:
# Look up the feature groups this notebook works with.
# customers, articles and transactions are all read at version 1 ...
customers_fg, articles_fg, trans_fg = (
    fs.get_feature_group(name=fg_name, version=1)
    for fg_name in ("customers", "articles", "transactions")
)

# ... while the ranking feature group is read at version 2.
rank_fg = fs.get_feature_group(name="ranking", version=2)

In [7]:
# Create the feature views (or fetch them if they already exist).

# Customers: every feature of the customers feature group.
selected_features_customers = customers_fg.select_all()
fs.get_or_create_feature_view(
    name='customers', version=1, query=selected_features_customers,
)

# Articles: every feature of the articles feature group.
selected_features_articles = articles_fg.select_all()
fs.get_or_create_feature_view(
    name='articles', version=1, query=selected_features_articles,
)

# Ranking: everything except the customer_id and article_id columns, with
# "label" declared as the label column of the view.
selected_features_ranking = rank_fg.select_except(["customer_id", "article_id"])
feature_view_ranking = fs.get_or_create_feature_view(
    name='ranking',
    version=2,
    labels=["label"],
    query=selected_features_ranking,
)

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Training Data

In [8]:
# Pull the ranking feature view as train/validation features and labels,
# holding out 10% of the rows (test_size=0.1) for validation.
X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(
    description='Ranking training dataset', test_size=0.1,
)

# Preview the first three training rows.
X_train.head(3)

Finished: Reading data from Hopsworks, using ArrowFlight (117.54s) 




Unnamed: 0,age,month_sin,month_cos,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,27.0,-1.0,-1.83697e-16,Boots,Shoes,Solid,Black,Dark,Black,Divided Shoes,Divided,Divided,Divided Accessories,Shoes
2,22.0,-1.0,-1.83697e-16,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Fancy DS,Divided,Divided,Divided Selected,Jersey Fancy
4,36.0,-1.0,-1.83697e-16,Sweater,Garment Upper body,Solid,Black,Dark,Black,Knitwear,Ladieswear,Ladieswear,Womens Tailoring,Knitwear


In [9]:
# Distinct values present in the label column of the training split.
print(y_train.loc[:, 'label'].unique())

# Preview the first three label rows.
y_train.head(3)

[0 1]


Unnamed: 0,label
0,0
2,0
4,0


<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Model Training

In [10]:
# Categorical features: every string/object-typed column of X_train.
cat_features = X_train.select_dtypes(include=['string', 'object']).columns.tolist()

# Wrap both splits in CatBoost's optimized Pool data structure.
pool_train = Pool(data=X_train, label=y_train, cat_features=cat_features)
pool_val = Pool(data=X_val, label=y_val, cat_features=cat_features)

# Classifier configuration; scale_pos_weight up-weights the positive class,
# and training stops early once the eval metric has not improved for
# 5 rounds, keeping the best iteration.
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.2,
    depth=10,
    scale_pos_weight=10,
    use_best_model=True,
    early_stopping_rounds=5,
)

# Fit on the training pool, evaluating on the validation pool each round.
model.fit(pool_train, eval_set=pool_val)

0:	learn: 0.6672984	test: 0.6672279	best: 0.6672279 (0)	total: 5.68s	remaining: 9m 22s
1:	learn: 0.6484568	test: 0.6486039	best: 0.6486039 (1)	total: 10s	remaining: 8m 11s
2:	learn: 0.6356744	test: 0.6359217	best: 0.6359217 (2)	total: 14.1s	remaining: 7m 37s
3:	learn: 0.6254611	test: 0.6258056	best: 0.6258056 (3)	total: 17.9s	remaining: 7m 9s
4:	learn: 0.6186147	test: 0.6189046	best: 0.6189046 (4)	total: 21.6s	remaining: 6m 49s
5:	learn: 0.6134491	test: 0.6137101	best: 0.6137101 (5)	total: 25.3s	remaining: 6m 35s
6:	learn: 0.6082789	test: 0.6085570	best: 0.6085570 (6)	total: 29s	remaining: 6m 25s
7:	learn: 0.6038313	test: 0.6040795	best: 0.6040795 (7)	total: 32.8s	remaining: 6m 17s
8:	learn: 0.6006690	test: 0.6008774	best: 0.6008774 (8)	total: 36.5s	remaining: 6m 8s
9:	learn: 0.5982598	test: 0.5984656	best: 0.5984656 (9)	total: 40.2s	remaining: 6m 1s
10:	learn: 0.5955777	test: 0.5956999	best: 0.5956999 (10)	total: 43.9s	remaining: 5m 55s
11:	learn: 0.5938316	test: 0.5939404	best: 0.593

<catboost.core.CatBoostClassifier at 0x7f5126f761d0>

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Model Validation

In [11]:
# Class predictions for the validation pool.
preds = model.predict(pool_val)

# Precision / recall / F-score computed with average="binary"; the fourth
# returned element (support) is discarded.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_val, preds, average="binary",
)

# Collected in a dict so the scores can be attached to the registered model.
metrics = dict(precision=precision, recall=recall, fscore=fscore)

print(classification_report(y_val, preds))

              precision    recall  f1-score   support

           0       0.96      0.65      0.78    445900
           1       0.21      0.78      0.33     52537

    accuracy                           0.66    498437
   macro avg       0.59      0.72      0.55    498437
weighted avg       0.88      0.66      0.73    498437



<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Feature Importance

In [12]:
# Pair each training column with the importance score the model assigned it.
feat_to_score = dict(zip(X_train.columns, model.feature_importances_))

# Re-order the mapping from most to least important feature.
feat_to_score = dict(
    sorted(feat_to_score.items(), key=lambda kv: kv[1], reverse=True)
)

feat_to_score

{'age': 13.505568491622682,
 'section_name': 12.532571454715532,
 'index_group_name': 11.571737528360387,
 'department_name': 9.089974856665528,
 'graphical_appearance_name': 8.692475054793492,
 'garment_group_name': 8.488023545500859,
 'product_type_name': 8.487487768947958,
 'perceived_colour_value_name': 6.649502564750725,
 'product_group_name': 6.180066914616397,
 'colour_group_name': 6.045853099100066,
 'perceived_colour_master_name': 5.168143576791541,
 'index_name': 3.571679652831134,
 'month_sin': 0.010395166513608067,
 'month_cos': 0.006520324790075298}

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Save and Upload to Registry

In [15]:
# Serialise the trained model to ranking_model.pkl in the working directory.
joblib.dump(model, filename='ranking_model.pkl')

['ranking_model.pkl']

In [13]:
# Connect to Hopsworks Model Registry
# `mr` is the registry handle of the logged-in project; it is used below to
# create and upload the ranking model.
mr = project.get_model_registry()

Connected. Call `.close()` to terminate connection gracefully.


In [16]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# One sampled training row, as a list of records, stored as the model's
# input example.
input_example = X_train.sample().to_dict("records")

# Model schema: inputs derived from the training features, outputs from the
# training labels.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Register the model's metadata (validation metrics, schema, example input).
ranking_model = mr.python.create_model(
    name="ranking_model", 
    metrics=metrics,
    model_schema=model_schema,
    input_example=input_example,
    description="Ranking model that scores item candidates",
)

# Upload the artifact that joblib.dump() wrote in the earlier cell
# ('ranking_model.pkl' in the working directory). The previous path,
# "model_artifacts/ranking_model.pkl", is never written by this notebook, so
# a fresh run would either fail or upload a stale file.
ranking_model.save("ranking_model.pkl")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/6186039 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/477 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1274 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/475285/models/ranking_model/1


Model(name: 'ranking_model', version: 1)