<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Setup

<div style="background-color:teal; color:white; padding:5px; font-size:20px">
Installs

In [1]:
!pip install hopsworks

Collecting hopsworks
  Using cached hopsworks-3.4.4-py3-none-any.whl
Collecting hsfs<3.5.0,>=3.4.0 (from hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached hsfs-3.4.7-py3-none-any.whl
Collecting hsml<3.5.0,>=3.4.0 (from hopsworks)
  Using cached hsml-3.4.6-py3-none-any.whl
Collecting pyhumps==1.6.1 (from hopsworks)
  Using cached pyhumps-1.6.1-py3-none-any.whl (5.0 kB)
Collecting furl (from hopsworks)
  Using cached furl-2.1.3-py2.py3-none-any.whl (20 kB)
Collecting pyjks (from hopsworks)
  Using cached pyjks-20.0.0-py2.py3-none-any.whl (45 kB)
Collecting avro==1.11.0 (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached avro-1.11.0-py2.py3-none-any.whl
Collecting PyMySQL[rsa] (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached PyMySQL-1.1.0-py3-none-any.whl.metadata (4.4 kB)
Collecting great-expectations==0.14.13 (from hsfs<3.5.0,>=3.4.0->hsfs[python]<3.5.0,>=3.4.0->hopsworks)
  Using cached great_expectations-0.14.13-py3-none-a

<div style="background-color:teal; color:white; padding:5px; font-size:20px">
Imports

In [2]:
import pandas as pd
import numpy as np

import importlib
import sys
sys.path.append('./scripts')

import feat_engineer_helpers

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Hopsworks Feature Store Connection

In [3]:
import boto3

# Name of the SSM parameter that is fetched below
parameter_name = 'hopsworks-api-key'

# Systems Manager client bound to the us-east-1 region
ssm = boto3.client('ssm', region_name='us-east-1')

# Request the parameter with decryption enabled, then pull the value
# (the Hopsworks API key) out of the response payload
response = ssm.get_parameter(
    Name=parameter_name,
    WithDecryption=True,
)
hopsworks_api_key = response['Parameter']['Value']

In [4]:
import hopsworks

# Authenticate with the API key fetched from SSM in the previous cell.
# Calling `hopsworks.login()` with no arguments never used that key, so the login
# silently depended on whatever other credentials happened to be available.
project = hopsworks.login(api_key_value=hopsworks_api_key)

# Handle to the project's feature store, used by every cell below
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/475285
Connected. Call `.close()` to terminate connection gracefully.


<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Data Download & Feature Engineering

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Articles

In [None]:
# Read the raw articles table from the remote parquet file
articles_df = pd.read_parquet('https://repo.hops.works/dev/jdowling/articles.parquet')
# Report (rows, columns) and preview the first three rows
print(articles_df.shape)
articles_df.head(3)

In [None]:
# Per-column NaN counts, restricted to the columns that have at least one missing value
articles_df.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
from feat_engineer_helpers import prepare_articles

# Apply the project's article preprocessing helper (implemented in the local
# feat_engineer_helpers module, not shown here); the result replaces the raw
# frame under the same name, so re-running this cell re-applies the helper.
articles_df = prepare_articles(articles_df)
# Preview the first three rows of the processed frame
articles_df.head(3)

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Customers

In [None]:
# Read the raw customers table from the remote parquet file
customers_df = pd.read_parquet('https://repo.hops.works/dev/jdowling/customers.parquet')
# Report (rows, columns) and preview the first three rows
print(customers_df.shape)
customers_df.head(3)

In [None]:
# Per-column NaN counts, restricted to the columns that have at least one missing value
customers_df.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
from feat_engineer_helpers import prepare_customers

# Apply the project's customer preprocessing helper (implemented in the local
# feat_engineer_helpers module, not shown here); the result replaces the raw
# frame under the same name, so re-running this cell re-applies the helper.
customers_df = prepare_customers(customers_df)
# Row count after preprocessing, followed by a three-row preview
print(len(customers_df))
customers_df.head(3)

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Transactions

In [None]:
# Read the transactions table from the remote parquet file and keep only
# the first 600,000 rows of it
trans_df = pd.read_parquet('https://repo.hops.works/dev/jdowling/transactions_train.parquet')[:600000]
# Report (rows, columns) and preview the first three rows
print(trans_df.shape)
trans_df.head(3)

In [None]:
# Per-column NaN counts, restricted to the columns that have at least one missing value
trans_df.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
from feat_engineer_helpers import prepare_transactions

# Apply the project's transaction preprocessing helper (implemented in the local
# feat_engineer_helpers module, not shown here); the result replaces the raw
# frame under the same name, so re-running this cell re-applies the helper.
trans_df = prepare_transactions(trans_df)
# Row count after preprocessing (thousands-separated), followed by a three-row preview
print(f"There are {len(trans_df):,} transactions in total.")
trans_df.head(3)

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Create Feature Groups (Hopsworks)

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Articles

In [None]:
# Feature group definition: version 1 of "articles", keyed by article_id,
# with the online store enabled
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    version=1,
    primary_key=["article_id"],
    online_enabled=True,
    description="Fashion items data including type of item, visual description and category",
)

# Write the prepared articles frame into the feature group
articles_fg.insert(articles_df)

In [None]:
# Descriptions for the "articles" features, written as (feature name, description)
# pairs and expanded into the list of {"name", "description"} records used below
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("article_id", "Identifier for the article."),
        ("product_code", "Code associated with the product."),
        ("prod_name", "Name of the product."),
        ("product_type_no", "Number associated with the product type."),
        ("product_type_name", "Name of the product type."),
        ("product_group_name", "Name of the product group."),
        ("graphical_appearance_no", "Number associated with graphical appearance."),
        ("graphical_appearance_name", "Name of the graphical appearance."),
        ("colour_group_code", "Code associated with the colour group."),
        ("colour_group_name", "Name of the colour group."),
        ("perceived_colour_value_id", "ID associated with perceived colour value."),
        ("perceived_colour_value_name", "Name of the perceived colour value."),
        ("perceived_colour_master_id", "ID associated with perceived colour master."),
        ("perceived_colour_master_name", "Name of the perceived colour master."),
        ("department_no", "Number associated with the department."),
        ("department_name", "Name of the department."),
        ("index_code", "Code associated with the index."),
        ("index_name", "Name of the index."),
        ("index_group_no", "Number associated with the index group."),
        ("index_group_name", "Name of the index group."),
        ("section_no", "Number associated with the section."),
        ("section_name", "Name of the section."),
        ("garment_group_no", "Number associated with the garment group."),
        ("garment_group_name", "Name of the garment group."),
        ("prod_name_length", "Length of the product name."),
    )
]

# Push every description to the articles feature group, one feature at a time
for desc in feature_descriptions:
    articles_fg.update_feature_description(desc["name"], desc["description"])

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Customers

In [None]:
# Feature group definition: version 1 of "customers", keyed by customer_id
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    version=1,
    primary_key=["customer_id"],
    online_enabled=True,  # Low latency access to data
    description="Customers data including age and postal code",
)

# Write the prepared customers frame into the feature group
customers_fg.insert(customers_df)

In [None]:
# Descriptions for the "customers" features, written as (feature name, description)
# pairs and expanded into the list of {"name", "description"} records used below
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("customer_id", "Unique identifier for each customer."),
        ("club_member_status", "Membership status of the customer in the club."),
        ("age", "Age of the customer."),
        ("postal_code", "Postal code associated with the customer's address."),
        ("age_group", "Categorized age group of the customer."),
    )
]

# Push every description to the customers feature group, one feature at a time
for desc in feature_descriptions:
    customers_fg.update_feature_description(desc["name"], desc["description"])

<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Transactions

In [None]:
# Feature group definition: version 1 of "transactions", keyed by the
# (customer_id, article_id) pair, with t_dat registered as the event time
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    primary_key=["customer_id", "article_id"],
    event_time="t_dat",
    online_enabled=True,
    description="Transactions data including customer, item, price, sales channel and transaction date",
)

# Write the prepared transactions frame; the wait_for_job option is passed through
# to the write (presumably blocks until the ingestion job finishes - confirm in hsfs docs)
trans_fg.insert(trans_df, write_options={"wait_for_job": True})

In [None]:
# Descriptions for the "transactions" features, written as (feature name, description)
# pairs and expanded into the list of {"name", "description"} records used below
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("t_dat", "Timestamp of the data record."),
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the purchased article."),
        ("price", "Price of the purchased article."),
        ("sales_channel_id", "Identifier for the sales channel."),
        ("year", "Year of the transaction."),
        ("month", "Month of the transaction."),
        ("day", "Day of the transaction."),
        ("day_of_week", "Day of the week of the transaction."),
        ("month_sin", "Sine of the month used for seasonal patterns."),
        ("month_cos", "Cosine of the month used for seasonal patterns."),
    )
]

# Push every description to the transactions feature group, one feature at a time
for desc in feature_descriptions:
    trans_fg.update_feature_description(desc["name"], desc["description"])

<div style="background-color:moccasin; color:black; padding:5px; font-size:20px">
Create Rankings

In [5]:
# Re-fetch version 1 of the three source feature groups from Hopsworks,
# in the order transactions -> customers -> articles
trans_fg, customers_fg, articles_fg = (
    fs.get_feature_group(name=fg_name, version=1)
    for fg_name in ("transactions", "customers", "articles")
)

In [6]:
def compute_ranking_dataset(trans_fg, articles_fg, customers_fg, neg_ratio=10, random_state=2):
    """Build the labelled (customer, article) ranking dataset.

    Positive pairs are the rows returned by joining the transactions, articles
    and customers feature groups. Negative pairs are synthesised by sampling,
    independently and with replacement, from three pools taken from the
    positives: the distinct article ids, the customer ids, and the
    (age, month_sin, month_cos) rows. Positives get ``label == 1`` and
    negatives ``label == 0``. The combined frame is then inner-joined with the
    categorical item columns read from ``articles_fg``.

    Note: negatives are sampled at random, so a generated (customer, article)
    pair can coincide with a real purchase; such pairs are not filtered out.

    Args:
        trans_fg: Transactions feature group (must expose ``select``).
        articles_fg: Articles feature group (must expose ``select_all`` and ``read``).
        customers_fg: Customers feature group (must expose ``select``).
        neg_ratio: Number of negative pairs generated per positive pair.
            Must be a non-negative integer. Defaults to 10.
        random_state: Integer base seed. The three sampling steps use
            ``random_state``, ``random_state + 1`` and ``random_state + 2``,
            so the default of 2 yields the seeds 2, 3 and 4.

    Returns:
        pandas.DataFrame with columns customer_id, age, month_sin, month_cos,
        article_id, label, followed by the item description columns.

    Raises:
        ValueError: If ``neg_ratio`` is negative.
    """
    if neg_ratio < 0:
        raise ValueError("neg_ratio must be a non-negative integer")

    # DATASET JOINS & FILTERING
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Join the three feature groups and keep only the features used for ranking.
    query_features = ["customer_id", "age", "month_sin", "month_cos", "article_id"]
    fg_query = (
        trans_fg.select(["month_sin", "month_cos"])
        .join(articles_fg.select_all(), on=["article_id"])
        .join(customers_fg.select(["customer_id", "age"]))
    )
    positive_pairs = fg_query.read()[query_features].copy()

    # NEG SAMPLE GENERATION
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    n_neg = len(positive_pairs) * neg_ratio
    context_cols = ["age", "month_sin", "month_cos"]

    positive_pairs["label"] = 1
    frames = [positive_pairs]

    # Skip sampling entirely when there is nothing to sample (no positives or
    # neg_ratio == 0) rather than asking pandas to sample from an empty pool.
    if n_neg > 0:
        sampled_articles = (
            positive_pairs["article_id"]
            .drop_duplicates()
            .sample(n_neg, replace=True, random_state=random_state)
        )
        sampled_customers = positive_pairs["customer_id"].sample(
            n_neg, replace=True, random_state=random_state + 1
        )
        sampled_context = positive_pairs[context_cols].sample(
            n_neg, replace=True, random_state=random_state + 2
        )

        # Assemble column by column (positionally) so each feature keeps its own dtype.
        negative_pairs = pd.DataFrame(
            {
                "article_id": sampled_articles.to_numpy(),
                "customer_id": sampled_customers.to_numpy(),
                **{col: sampled_context[col].to_numpy() for col in context_cols},
            }
        )
        negative_pairs["label"] = 0
        frames.append(negative_pairs)

    # FINISHING TOUCHES
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    ranking_df = pd.concat(frames, ignore_index=True)

    # One row per article, restricted to the categorical item columns.
    item_columns = [
        "article_id", "product_type_name", "product_group_name", "graphical_appearance_name",
        "colour_group_name", "perceived_colour_value_name", "perceived_colour_master_name",
        "department_name", "index_name", "index_group_name", "section_name", "garment_group_name",
    ]
    item_df = articles_fg.read().drop_duplicates(subset="article_id")[item_columns]

    # Inner join: rows whose article_id is missing from item_df are dropped.
    return ranking_df.merge(item_df, on="article_id")

In [7]:
# Build the labelled ranking dataset from the three feature groups loaded above
ranking_df = compute_ranking_dataset(
    trans_fg, 
    articles_fg, 
    customers_fg,
)

Finished: Reading data from Hopsworks, using ArrowFlight (214.56s) 
df len:542409
pos_pairs len:542409
neg_pairs len:5424090
Finished: Reading data from Hopsworks, using ArrowFlight (9.47s) 
item_df len:105542
item_df cols:Index(['article_id', 'product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'index_group_name', 'section_name',
       'garment_group_name'],
      dtype='object')
ranking_df cols:Index(['customer_id', 'age', 'month_sin', 'month_cos', 'article_id', 'label'], dtype='object')
ranking_df len:5966499
ranking_df unique:[1 0]


In [12]:
# Preview the first rows of the ranking dataset (positives are concatenated first)
ranking_df.head()

Unnamed: 0,customer_id,age,month_sin,month_cos,article_id,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,e2eb4cae50256f162513c51f2ac209dcaed12dab7cf168...,36.0,-1.0,-1.83697e-16,618839003,1,Sweater,Garment Upper body,Stripe,Dark Blue,Bright,Blue,Baby Girl Knitwear,Baby Sizes 50-98,Baby/Children,Baby Girl,Knitwear
1,977955d53827da368f4b903124bac0c2a72596c7191538...,25.0,-1.0,-1.83697e-16,618839003,1,Sweater,Garment Upper body,Stripe,Dark Blue,Bright,Blue,Baby Girl Knitwear,Baby Sizes 50-98,Baby/Children,Baby Girl,Knitwear
2,530cc2dd646023036d8c6ac15bc3ea7a519fafafdb7935...,37.0,-1.0,-1.83697e-16,618839003,1,Sweater,Garment Upper body,Stripe,Dark Blue,Bright,Blue,Baby Girl Knitwear,Baby Sizes 50-98,Baby/Children,Baby Girl,Knitwear
3,04a4edbfccba7339ce2eec1d67b08397614e35d2676daa...,33.0,-1.0,-1.83697e-16,618839003,1,Sweater,Garment Upper body,Stripe,Dark Blue,Bright,Blue,Baby Girl Knitwear,Baby Sizes 50-98,Baby/Children,Baby Girl,Knitwear
4,332a319e053d492852344637fd5f5debafc07e7faaa15e...,36.0,-1.0,-1.83697e-16,618839003,1,Sweater,Garment Upper body,Stripe,Dark Blue,Bright,Blue,Baby Girl Knitwear,Baby Sizes 50-98,Baby/Children,Baby Girl,Knitwear


<div style="background-color: darkgreen ; color:white; padding:0px; font-size:15px">
Add Rankings Feature Group

In [13]:
# Feature group definition: version 2 of "ranking", keyed by the
# (customer_id, article_id) pair, with the three source feature groups
# declared as its parents
rank_fg = fs.get_or_create_feature_group(
    name="ranking",
    version=2,
    primary_key=["customer_id", "article_id"],
    parents=[articles_fg, customers_fg, trans_fg],
    description="Derived feature group for ranking",
)

# Write the ranking dataset into the feature group
rank_fg.insert(ranking_df)

Uploading Dataframe: 0.00% |          | Rows 0/5966499 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: ranking_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/475285/jobs/named/ranking_2_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7f0112b26c50>, None)

In [14]:
# Descriptions for the "ranking" features, written as (feature name, description)
# pairs and expanded into the list of {"name", "description"} records used below
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the purchased article."),
        ("age", "Age of the customer."),
        ("month_sin", "Sine of the month used for seasonal patterns."),
        ("month_cos", "Cosine of the month used for seasonal patterns."),
        ("product_type_name", "Name of the product type."),
        ("product_group_name", "Name of the product group."),
        ("graphical_appearance_name", "Name of the graphical appearance."),
        ("colour_group_name", "Name of the colour group."),
        ("perceived_colour_value_name", "Name of the perceived colour value."),
        ("perceived_colour_master_name", "Name of the perceived colour master."),
        ("department_name", "Name of the department."),
        ("index_name", "Name of the index."),
        ("index_group_name", "Name of the index group."),
        ("section_name", "Name of the section."),
        ("garment_group_name", "Name of the garment group."),
        ("label", "Label indicating whether the article was purchased (1) or not (0)."),
    )
]

# Push every description to the ranking feature group, one feature at a time
for desc in feature_descriptions:
    rank_fg.update_feature_description(desc["name"], desc["description"])