In [None]:
from src.config import (
    HOPSWORKS_API_KEY, 
    HOPSWORKS_PROJECT_NAME, 
    FEATURE_GROUP_NAME, 
    FEATURE_GROUP_VERSION
    )

import hopsworks

In [None]:
from datetime import datetime, timedelta
import pandas as pd

# Size of the re-fetch window in days. A single day's worth of data would be
# enough for a daily run, but a wider window adds redundancy in case an
# earlier fetch failed.
LOOKBACK_DAYS = 32

current_date = datetime.now().date()
from_date = current_date - timedelta(days=LOOKBACK_DAYS)

In [None]:
from src.data import download_new_batch_of_data, fill_missing_demand_values

def fetch_batch_raw_data(from_date: datetime, to_date: datetime) -> pd.DataFrame:
    """
    Downloads raw demand data in the half-open range [from_date, to_date).

    Every calendar month touched by the range is downloaded exactly once and
    both bounds are applied to every monthly batch. This means that:

    * a range that lies inside a single month does not return duplicated or
      out-of-range rows, and
    * a range that touches three or more months (e.g. Jan 30 -> Mar 3) does
      not silently skip the months in between.

    The bounds are compared directly against the "datetime" column, so they
    must be of a type that is comparable with the values in that column.

    Args:
        from_date: inclusive lower bound of the range
        to_date: exclusive upper bound of the range

    Returns:
        Dataframe of demand, sorted by "ba_code" and then "datetime"

    Raises:
        ValueError: if the month of from_date is later than the month of to_date
    """
    # Enumerate every (year, month) pair from from_date's month up to and
    # including to_date's month.
    months = []
    year, month = from_date.year, from_date.month
    while (year, month) <= (to_date.year, to_date.month):
        months.append((year, month))
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)

    if not months:
        raise ValueError(
            f"from_date ({from_date}) must not be in a later month than to_date ({to_date})"
        )

    batches = []
    for batch_year, batch_month in months:
        # Download full month
        batch = download_new_batch_of_data(batch_year, batch_month)
        # Filter out unwanted rows; applying both bounds to every batch keeps
        # the result correct even when the range starts and ends in the same month
        in_range = (batch["datetime"] >= from_date) & (batch["datetime"] < to_date)
        batches.append(batch[in_range])

    data = pd.concat(batches)

    # To deal with downcasting when filling NaNs
    data["demand"] = data["demand"].astype(int)

    data = fill_missing_demand_values(data)

    data = data.sort_values(by=["ba_code", "datetime"])

    return data

In [42]:
# Fetch the raw demand rows from from_date (inclusive) up to, but not
# including, current_date.
data = fetch_batch_raw_data(from_date=from_date, to_date=current_date)

100%|██████████| 67/67 [00:00<00:00, 1098.86it/s]


In [44]:
# Log in to Hopsworks with the project name and API key taken from src.config.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY,
)

# Handle to the project's feature store.
feature_store = project.get_feature_store()

# Fetch the feature group with this name/version, creating it if it does not
# exist yet. Rows are keyed by (ba_code, datetime) and "datetime" is also
# declared as the event-time column.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Daily demand",
    primary_key=["ba_code", "datetime"],
    event_time="datetime",
)

2024-12-09 16:54:02,965 INFO: Closing external client and cleaning up certificates.
Connection closed.
2024-12-09 16:54:02,972 INFO: Initializing external client
2024-12-09 16:54:02,972 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-09 16:54:03,922 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1051798


In [45]:
# Upload the fetched rows to the feature group. With wait_for_job=False the
# call presumably returns without blocking on the materialization job it
# launches -- confirm against the Hopsworks documentation.
feature_group.insert(data, write_options={"wait_for_job": False})

Uploading Dataframe: 100.00% |██████████| Rows 2144/2144 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: daily_demand_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1051798/jobs/named/daily_demand_feature_group_1_offline_fg_materialization/executions


(Job('daily_demand_feature_group_1_offline_fg_materialization', 'SPARK'), None)