In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import src.config as config

In [3]:
import hopsworks

# connect to project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# connect to feature store
feature_store = project.get_feature_store()

# connect to feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1051798
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# Create feature view if it doesn't exist already
# This view only uses one feature group so the query is trivial
try:
    # Create feature view if it doesn't exist
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_GROUP_VERSION,
        query=feature_group.select_all(),
    )
except:
    print('Feature view already existed, skip creation.')
    
# # Get feature view 
# feature_view = feature_store.get_feature_view(
#     name=config.FEATURE_VIEW_NAME,
#     version=config.FEATURE_VIEW_VERSION
# )

Feature view already existed, skip creation.


In [5]:
# Get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME, version=config.FEATURE_VIEW_VERSION
)

In [6]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides'
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (190.95s) 



In [7]:
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data.head()

Unnamed: 0,pickup_hour,rides,pickup_location_id
2005548,2022-01-01 00:00:00+00:00,0,1
4771557,2022-01-01 01:00:00+00:00,0,1
2378505,2022-01-01 02:00:00+00:00,0,1
2723058,2022-01-01 03:00:00+00:00,0,1
21571,2022-01-01 04:00:00+00:00,1,1


In [8]:
from src.data import transform_ts_into_features_and_targets

features, targets = transform_ts_into_features_and_targets(
    ts_data=ts_data,
    n_features=24*28, # One month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f"{features_and_target.shape=}")

100%|██████████| 259/259 [00:26<00:00,  9.67it/s]


features_and_target.shape=(218391, 675)


In [9]:
features_and_target['pickup_hour'].head()

0    2022-01-29 00:00:00+00:00
1    2022-01-29 23:00:00+00:00
2    2022-01-30 22:00:00+00:00
3    2022-01-31 21:00:00+00:00
4    2022-02-01 20:00:00+00:00
Name: pickup_hour, dtype: object

In [10]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# Training data from  Jan 2022 up until 2 months ago
# Testing data from the last two months

cutoff_date = pd.to_datetime(date.today() - timedelta(days=28 * 2)).tz_localize("UTC")

print(f"{cutoff_date=}")

features_and_target["pickup_hour"] = pd.to_datetime(features_and_target["pickup_hour"])

X_train, y_train, X_test, y_test = train_test_split(
    df=features_and_target,
    cutoff_date=cutoff_date,
    target_column_name='target_rides_next_hour',
)

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

cutoff_date=Timestamp('2024-08-02 00:00:00+0000', tz='UTC')
X_train.shape=(210954, 674)
y_train.shape=(210954,)
X_test.shape=(7437, 674)
y_test.shape=(7437,)


In [11]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline


def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyperparameters, train a model and compute an average validation error based
    on a TimeSeriesSplit
    """

    # Pick hyperparameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_fraction", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # Evaluate model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    return np.array(scores).mean()

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1)

[I 2024-09-27 13:25:17,399] A new study created in memory with name: no-name-b0258eec-4ecc-496b-9a78-fcf7d4f168e1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-09-27 13:25:46,851] Trial 0 finished with value: 3.6164204727640845 and parameters: {'num_leaves': 46, 'feature_fraction': 0.8473435766259299, 'bagging_fraction': 0.6539914823118946, 'min_child_fraction': 97}. Best is trial 0 with value: 3.6164204727640845.


In [13]:
best_params = study.best_trial.params
best_params

{'num_leaves': 46,
 'feature_fraction': 0.8473435766259299,
 'bagging_fraction': 0.6539914823118946,
 'min_child_fraction': 97}

In [14]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [15]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
test_mae

4.296732496301819

In [16]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/Users/douglaslazenby/Documents/Courses/RealWorldMLTutorial/taxi_demand_predictor/models/model.pkl']

In [17]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [19]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with hyperparameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema,
)

model.save(str(MODELS_DIR / 'model.pkl'))

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/470473 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3762 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/58136 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1051798/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)