In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [47]:
import pandas as pd

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from src.model import forwardfill_missing_values, make_pipeline, save_model
from src.paths import MODEL_DIR
import src.config as config


from src.data import split_data, transform_training_data


In [11]:
import hopsworks

# Log in to Hopsworks; the project name and API key come from src.config.
project = hopsworks.login(
    api_key_value=config.HOPSWORKS_API_KEY,
    project=config.HOPSWORKS_PROJECT_NAME,
)

# Handle to the feature store of the logged-in project.
feature_store = project.get_feature_store()

# Look up the feature group by its configured name and version.
feature_group = feature_store.get_feature_group(
    version=config.FEATURE_GROUP_VERSION,
    name=config.FEATURE_GROUP_NAME,
)

2025-03-18 13:54:43,468 INFO: Initializing external client
2025-03-18 13:54:43,468 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-18 13:54:45,671 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1051798


In [None]:
# Create the feature view on the first run; later runs reuse the existing one.
# The view is registered under FEATURE_VIEW_VERSION -- the same version this
# notebook uses when it fetches the view -- not the feature group's version.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),  # Create from all features in FG
    )
except Exception as err:
    # Best-effort: creation is expected to fail when the view already exists.
    # Catching Exception (not a bare except) lets Ctrl-C still interrupt the
    # cell, and printing the error keeps an unrelated failure (auth, network,
    # bad query) from being silently reported as "already existed".
    print(
        "Feature view already existed, skip creation. "
        f"({type(err).__name__}: {err})"
    )

In [12]:
# Fetch the feature view by its configured name and version.
feature_view = feature_store.get_feature_view(
    version=config.FEATURE_VIEW_VERSION,
    name=config.FEATURE_VIEW_NAME,
)

In [None]:
# Read the feature view's training data; the second returned value is unused here.
data, _ = feature_view.training_data(description="Daily demand")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.98s) 



In [35]:
def prepare_feature_store_data_for_training(data: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a feature-store extract so it can be fed to the training pipeline.

    Steps, in order:
      1. keep only rows whose ``ba_code`` is listed in ``src.config.BAS``;
      2. parse ``datetime`` and strip its timezone information;
      3. sort by ``ba_code`` and then ``datetime``;
      4. hand the result to ``forwardfill_missing_values``.

    The input frame is not modified.

    Args:
        data: dataframe from Hopsworks feature store; must contain the
            ``ba_code`` and ``datetime`` columns.

    Returns:
        pd.DataFrame: whatever ``forwardfill_missing_values`` returns for the
        filtered, sorted frame.
    """
    from src.config import BAS

    # Boolean-mask filtering followed by .copy() gives an independent frame,
    # so the column assignment below never touches the caller's data.
    wanted_rows = data["ba_code"].isin(BAS)
    selected = data[wanted_rows].copy()

    # Parse to datetimes, then drop the timezone so values are tz-naive.
    selected["datetime"] = pd.to_datetime(selected["datetime"]).dt.tz_localize(None)

    ordered = selected.sort_values(by=["ba_code", "datetime"])

    return forwardfill_missing_values(ordered)

In [42]:
# Filtered, tz-naive, sorted and forward-filled demand data used for training.
demand = prepare_feature_store_data_for_training(data=data)

In [43]:
# Training pipeline: split -> transform -> fit LightGBM -> report MAE.

# 0. Constants
train_end = "2024-03-01"  # split date handed to split_data
days_of_historic_data = 365  # forwarded unchanged to split_data

# 1. Split Data
X_train, y_train, X_test, y_test = split_data(
    demand, train_end=train_end, days_of_historic_data=days_of_historic_data
)

# 2. Transform Data
pipe = make_pipeline()
print(pipe)

X_train_t, y_train_t, X_test_t, y_test_t = transform_training_data(
    X_train, y_train, X_test, y_test, pipe
)

# 3. Train model (library default hyperparameters)
lgbm = LGBMRegressor()

lgbm.fit(X_train_t, y_train_t)

# 4. Evaluate model
preds_train = lgbm.predict(X_train_t)
preds_test = lgbm.predict(X_test_t)

# scikit-learn metrics take (y_true, y_pred), in that order. MAE happens to be
# symmetric, but keeping the documented order avoids wrong results if this
# line is later switched to an asymmetric metric.
mae_train = mean_absolute_error(y_train_t, preds_train)
mae_test = mean_absolute_error(y_test_t, preds_test)

print(f"Training score:\t{mae_train}")
print(f"Test score:\t{mae_test}")

# TODO: plot predictions vs. actuals and residuals for train/test
# (plot_predictions_against_actuals / plot_residuals are not imported in the
# import cell of this notebook -- import them before enabling the plots).

Data successfully split at 2024-03-01:
	X_train.shape=(62858, 3): 2020-12-01 --- 2024-02-29
	y_train.shape=(62858,)
	X_test.shape=(39538, 3): 2023-03-02 --- 2025-03-16
	y_test.shape=(39538,)
Pipeline(steps=[('datetime',
                 DatetimeFeatures(drop_original=False,
                                  features_to_extract=['month', 'week',
                                                       'day_of_week',
                                                       'day_of_month',
                                                       'weekend'],
                                  variables=['datetime'])),
                ('lags',
                 FeatureEngineerByBA(transformer=LagFeatures(periods=[1, 2, 3,
                                                                      4, 5, 6,
                                                                      7, 30,
                                                                      180,
                                                  

In [45]:
# Persist the fitted LightGBM model under the filename "lgbm.pkl"
# using the project's save_model helper.
save_model(filename="lgbm.pkl", model=lgbm)

In [46]:
from hsml.model_schema import ModelSchema
from hsml.schema import Schema

# Build the input/output schemas from the transformed training features and
# target, then combine them into the schema attached to the registered model.
input_schema, output_schema = Schema(X_train_t), Schema(y_train_t)
model_schema = ModelSchema(
    output_schema=output_schema,
    input_schema=input_schema,
)

In [48]:
# Register the trained model in the project's model registry.
model_registry = project.get_model_registry()

# Model entry carrying the test MAE, a one-row input example drawn at random
# from the transformed training features, and the model schema.
model = model_registry.sklearn.create_model(
    name="daily_demand_predictor",
    description="LightGBM regressor",
    metrics={"test_mae": mae_test},
    model_schema=model_schema,
    input_example=X_train_t.sample(),
)

# Upload the pickled model file from MODEL_DIR.
# NOTE(review): assumes save_model wrote "lgbm.pkl" into MODEL_DIR -- confirm.
model.save(str(MODEL_DIR / "lgbm.pkl"))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/289778 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/485 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2385 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1051798/models/daily_demand_predictor/1


Model(name: 'daily_demand_predictor', version: 1)