In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [2]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    """Read ride records whose pickup hour is at or after a look-back cutoff.

    The cutoff is the current UTC time minus ``hours`` hours, floored to the
    hour, with its year overwritten to 2024. Only a lower bound is applied to
    ``pickup_hour``, so every stored row from the cutoff onward is returned.

    Args:
        hours: Number of hours to step back from the current UTC time.

    Returns:
        Whatever the feature-store query's ``read`` call returns for the
        filtered selection of all feature-group columns.
    """
    now_utc = pd.Timestamp.now(tz="Etc/UTC")
    # NOTE(review): the year is pinned to 2024 -- presumably because the stored
    # rides are 2024 data replayed against the current calendar date; confirm.
    cutoff = (now_utc - timedelta(hours=hours)).floor('h').replace(year=2024)

    rides_fg = get_feature_store().get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1,
    )

    # Lower bound only: no upper limit is placed on pickup_hour.
    rides_query = rides_fg.select_all().filter(rides_fg.pickup_hour >= cutoff)
    return rides_query.read(read_options={"arrow_flight_config": {"timeout": 600}})

In [3]:
# Load rides using a 1-hour look-back; the cutoff is computed inside fetch_hourly_rides.
df = fetch_hourly_rides(1)

2025-05-11 02:24:31,554 INFO: Initializing external client
2025-05-11 02:24:31,554 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 02:24:33,455 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214680
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (24.65s) 


In [4]:
# Display the rides frame loaded by fetch_hourly_rides (last expression is rendered).
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-12-03 08:00:00+00:00,3211.06,0
1,2024-06-22 00:00:00+00:00,4137.14,0
2,2024-10-17 08:00:00+00:00,4425.02,7
3,2024-07-24 01:00:00+00:00,4007.01,0
4,2024-11-20 23:00:00+00:00,3759.07,1
...,...,...,...
3208886,2024-07-09 09:00:00+00:00,3931.09,0
3208887,2024-07-01 23:00:00+00:00,3657.06,2
3208888,2024-06-21 18:00:00+00:00,3266.01,0
3208889,2024-07-24 02:00:00+00:00,3520.01,0


In [5]:
from src.inference import fetch_next_hour_predictions
# Load predictions via the project helper; judging by its name it returns the
# predictions for the upcoming hour -- confirm against src.inference.
df_pred = fetch_next_hour_predictions()

2025-05-11 02:26:18,886 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-11 02:26:18,893 INFO: Initializing external client
2025-05-11 02:26:18,893 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 02:26:19,497 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214680
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.81s) 
Current UTC time: 2025-05-11 06:26:18.886229+00:00
Next hour: 2024-05-11 07:00:00+00:00
Found 568 records


In [6]:
# Display the predictions frame (last expression is rendered).
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
4544,3504.02,1.0,2024-05-11 07:00:00+00:00
4545,4225.14,5.0,2024-05-11 07:00:00+00:00
4546,3230.01,0.0,2024-05-11 07:00:00+00:00
4547,4187.04,0.0,2024-05-11 07:00:00+00:00
4548,4116.09,4.0,2024-05-11 07:00:00+00:00
...,...,...,...
5107,4095.10,5.0,2024-05-11 07:00:00+00:00
5108,2951.05,0.0,2024-05-11 07:00:00+00:00
5109,4266.03,4.0,2024-05-11 07:00:00+00:00
5110,3166.03,1.0,2024-05-11 07:00:00+00:00


In [7]:
# Join actual rides with predictions on location and hour. The default join is
# inner, so rows without a match in both frames are dropped.
merged_df = df.merge(df_pred, on=['pickup_location_id', 'pickup_hour'])

In [8]:
# Display the joined actual-vs-predicted frame (last expression is rendered).
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2024-05-11 07:00:00+00:00,4428.02,1,2.0
1,2024-05-11 07:00:00+00:00,4148.07,0,1.0
2,2024-05-11 07:00:00+00:00,3388.02,0,1.0
3,2024-05-11 07:00:00+00:00,3309.04,0,0.0
4,2024-05-11 07:00:00+00:00,4243.01,1,1.0
...,...,...,...,...
563,2024-05-11 07:00:00+00:00,4019.06,5,7.0
564,2024-05-11 07:00:00+00:00,4237.01,0,1.0
565,2024-05-11 07:00:00+00:00,3535.03,1,1.0
566,2024-05-11 07:00:00+00:00,4368.05,1,8.0


In [9]:
# Show the joined rows ordered by location, then hour. The sorted result is only
# displayed; it is not assigned, so merged_df itself keeps its original order.
merged_df.sort_values(["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
542,2024-05-11 07:00:00+00:00,190 Morgan,0,-0.0
46,2024-05-11 07:00:00+00:00,2733.03,0,-0.0
466,2024-05-11 07:00:00+00:00,2782.02,0,1.0
15,2024-05-11 07:00:00+00:00,2821.05,0,0.0
181,2024-05-11 07:00:00+00:00,2821.06,0,0.0
...,...,...,...,...
337,2024-05-11 07:00:00+00:00,4735.03,0,3.0
390,2024-05-11 07:00:00+00:00,4735.16,0,1.0
440,2024-05-11 07:00:00+00:00,4743.04,0,0.0
190,2024-05-11 07:00:00+00:00,4748.07,0,2.0


In [10]:
import pandas as pd  
import plotly.express as px
# Alias the actual rides and the predictions under the names used in this cell.
df1, df2 = df, df_pred

# Inner-join actuals with predictions on location and hour.
merged_df = df1.merge(df2, on=['pickup_location_id', 'pickup_hour'])

# Per-row absolute difference between predicted demand and observed rides.
merged_df['absolute_error'] = (merged_df['predicted_demand'] - merged_df['rides']).abs()

# Average the absolute error within each pickup hour to get the MAE per hour.
mae_by_hour = (
    merged_df
    .groupby('pickup_hour')['absolute_error']
    .mean()
    .reset_index()
    .rename(columns={'absolute_error': 'MAE'})
)

# Line chart of MAE over pickup hours, with a marker at every point.
fig = px.line(
    mae_by_hour,
    x='pickup_hour',
    y='MAE',
    title='Mean Absolute Error (MAE) by Pickup Hour',
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
    markers=True,
)

# Render the figure in the notebook.
fig.show()