In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this notebook is running inside Google Colab.

    The check looks for the substring ``"google.colab"`` in the string form of
    the shell object returned by IPython's ``get_ipython()``.

    Returns:
        True when that string mentions ``google.colab``, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root for the current environment. On Colab the
# repository is cloned and its dependencies are installed first.
if not is_google_colab():
    # Locally, the root is taken to be three levels above the working directory.
    root_dir = str(Path().absolute().parent.parent.parent)
    print("⛳️ Local environment")
else:
    clone_repository()
    install_dependencies()
    # After cloning, the working directory itself is used as the root.
    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")

# Put the root directory on `sys.path` (once) so the `mlfs` Python module can
# be imported from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

In [2]:
import hopsworks
from datetime import datetime

In [3]:
# Log in to Hopsworks and keep a handle to the project.
project = hopsworks.login()
# Handle to the project's feature store; the cells below fetch feature groups,
# transformation functions and the feature view through it.
fs = project.get_feature_store()

2024-12-18 15:00:46,726 INFO: Initializing external client
2024-12-18 15:00:46,727 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-18 15:00:48,787 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398


In [3]:
# Fetch version 1 of each feature group used to build the training query.
# The calls are made in the same order as the names on the left-hand side.
transactions, profiles, profiles_activity_5m, profiles_last_transaction = (
    fs.get_feature_group(group_name, version=1)
    for group_name in (
        "transactions",
        "profiles",
        "profiles_activity_5m",
        "profiles_last_transaction",
    )
)

In [4]:
# Build the feature query: the label and selected transaction features, joined
# to the three other feature groups on `account_id`.
query = (
    transactions.select(
        [
            'fraud_label',
            'amount',
            'category',
            'time_delta_t_minus_1',
            'loc_delta_t_minus_1',
            'outside_city',
        ]
    )
    .join(
        profiles.select_all(include_primary_key=False, include_event_time=False),
        on='account_id',
    )
    .join(
        profiles_activity_5m.select_all(include_primary_key=False, include_event_time=False),
        on='account_id',
    )
    # Left join: this data is not needed in the training dataset; it is joined
    # only to populate the inference helper columns.
    .join(
        profiles_last_transaction.select_all(include_primary_key=False),
        on='account_id',
        join_type="left",
    )
)

In [5]:
# Preview 5 rows of the joined query result to sanity-check the selected columns.
query.show(5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (19.68s) 


Unnamed: 0,fraud_label,amount,category,time_delta_t_minus_1,loc_delta_t_minus_1,outside_city,cc_provider,cc_type,city,age,cc_expiration_days,count,min_amount,max_amount,mean,last_transaction_datetime,latitude,longitude
0,0,70.99,Clothing,13.732755,0.209705,1,mastercard,debit,Lower West Side,39.589041,858,1,70.989998,70.989998,70.989998,2024-06-20 14:18:57+00:00,41.75338,-86.11084
1,0,99.2,Grocery,9.005637,0.164352,1,mastercard,debit,Lompoc,87.29863,1589,1,99.199997,99.199997,99.199997,2024-06-19 08:03:09+00:00,33.54428,-84.23381
2,0,32.6,Health/Beauty,0.763137,0.066322,1,visa,credit,Evergreen Park,29.413699,309,1,32.599998,32.599998,32.599998,2024-06-21 23:46:55+00:00,40.5576,-74.28459
3,0,88.61,Grocery,2.731215,0.193062,1,visa,credit,Martinsburg,54.627397,1284,1,88.610001,88.610001,88.610001,2024-06-22 13:13:04+00:00,41.75338,-86.11084
4,0,191.32,Electronics,2.942025,0.318032,1,visa,debit,Santa Maria,59.717808,97,1,191.320007,191.320007,191.320007,2024-06-24 12:59:11+00:00,39.32288,-76.72803


In [6]:
# Fetch the `label_encoder` transformation function from the feature store.
# It is attached to the categorical features (`category`, `cc_provider`,
# `cc_type`) when the feature view is created below.
label_encoder = fs.get_transformation_function(name="label_encoder")

In [8]:
# Get or create version 1 of the `fraud_model_fv` feature view from the query.
# `fraud_label` is the label; the helper columns are declared separately from
# the model features.
fraud_model_fv = fs.get_or_create_feature_view(
    name="fraud_model_fv",
    version=1,
    description="Fraud model feature view",
    query=query,
    labels=['fraud_label'],
    inference_helper_columns=[
        'city',
        'last_transaction_datetime',
        'latitude',
        'longitude',
    ],
    # Every categorical column is mapped to the same label encoder.
    transformation_functions={
        column: label_encoder
        for column in ("category", "cc_provider", "cc_type")
    },
)

Feature view created successfully, explore it at 
https://snurran.hops.works/p/15479/fs/15427/fv/fraud_model_fv/version/1


In [9]:
# Create the training dataset for the model as CSV, split into train and test
# sets by time.
# NOTE(review): train_end (2024-03-31 00:00) and test_start (2024-04-01 00:00)
# leave a one-day window that falls in neither split — confirm this is intended.
train_start = datetime(2023, 8, 1)
train_end = datetime(2024, 3, 31)
test_start = datetime(2024, 4, 1)
test_end = datetime(2024, 4, 10)

# Left as the cell's last expression so the returned value is displayed.
fraud_model_fv.create_train_test_split(
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    data_format="csv",
    coalesce=True,
    statistics_config={'histograms': True, 'correlations': True},
)

Training dataset job started successfully, you can follow the progress at 
https://snurran.hops.works/p/15479/jobs/named/fraud_model_fv_1_create_fv_td_26062024144050/executions



(1, <hsfs.core.job.Job at 0x7fd71c1c82e0>)