In [1]:
# Name of the Hopsworks project this notebook logs in to (passed as `project=`
# to `hopsworks.login` further down).
HOPSWORKS_PROJECT_NAME = 'taxi_demand_nyc'

In [2]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Load key-value pairs from the .env file located in the parent directory so
# the Hopsworks API key does not have to be hardcoded in the notebook.
load_dotenv(PARENT_DIR / '.env')

# Use .get() instead of os.environ[...]: indexing raises a bare KeyError when
# the variable is missing, which would make the explicit check below
# unreachable and hide the helpful error message.
HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')

# An empty value (e.g. `HOPSWORKS_API_KEY=` in .env) is as unusable as a
# missing one, so reject both here rather than failing later at login.
if not HOPSWORKS_API_KEY:
    raise ValueError("HOPSWORKS_API_KEY not found in environment variables. Please check your .env file.")

In [3]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Download raw ride data for every year from `from_year` up to and including
# the current calendar year.
from_year = 2023
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

# Collect one DataFrame per year and concatenate once at the end. Calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
yearly_rides = []
for year in range(from_year, to_year+1):

    # download data for the whole year
    rides_one_year = load_raw_data(year)

    # keep the yearly frame for the single concat below
    yearly_rides.append(rides_one_year)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (the value the original loop produced) if no year was processed.
rides = pd.concat(yearly_rides) if yearly_rides else pd.DataFrame()

Downloading raw data from 2023 to 2025
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local storage
File 2023-12 was already in local storage
File 2024-01 was already in local storage
File 2024-02 was already in local storage
File 2024-03 was already in local storage
File 2024-04 was already in local storage
File 2024-05 was already in local storage
File 2024-06 was already in local storage
File 2024-07 was already in local storage
File 2024-08 was already in local storage
File 2024-09 was already in local storage
File 2024-10 was already in local storage
Downloading file 2024-11
2024-11 file

In [4]:
# Show how many raw ride rows were loaded, formatted with thousands separators.
print('len(rides)=' + format(len(rides), ','))

len(rides)=72,164,140


In [5]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into the time-series frame that is later inserted into
# the feature group.
# NOTE(review): presumably one row per (pickup_location_id, pickup_hour), to
# match the feature group's primary key — confirm in src.data.
ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 265/265 [00:02<00:00, 108.33it/s]


In [6]:
import hopsworks

In [7]:
# Log in to Hopsworks with the API key read from the environment. The returned
# project handle is used below to obtain the feature store.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

2025-01-06 22:27:06,240 INFO: Initializing external client
2025-01-06 22:27:06,241 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-06 22:27:06,706 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208530


In [8]:
# Feature store handle of the logged-in project; used to get or create the
# feature group that receives the time-series data.
feature_store = project.get_feature_store()

In [9]:
# Name and version identifying the feature group; both are passed to
# `get_or_create_feature_group`.
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 1

In [10]:
# Fetch the feature group identified by name and version, creating it if it
# does not exist yet.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    # rows are keyed by location and hour
    primary_key=['pickup_location_id', 'pickup_hour'],
    # the pickup hour also serves as the event timestamp
    event_time='pickup_hour',
)

In [11]:
# Upload the time-series frame to the feature group.
# NOTE(review): `wait_for_job=False` should make this return once the
# materialization job has been launched instead of blocking until it finishes
# — confirm against the Hopsworks `write_options` documentation.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Uploading Dataframe: 100.00% |██████████| Rows 4261200/4261200 | Elapsed Time: 00:28 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1208530/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)