In [1]:
import pandas as pd

In [6]:
# Read in the validated data
PATH = "../data/validated/val_rides_2023_01.parquet"
rides = pd.read_parquet(PATH)
print(rides.shape)
print(rides.head(3))

(3066718, 2)
      pickup_datetime  location_id
0 2023-01-01 00:32:10          161
1 2023-01-01 00:55:08           43
2 2023-01-01 00:25:04           48


In [12]:
# Rides per hour - time series data at hourly frequency
rides['pickup_hour'] = rides.pickup_datetime.dt.floor("H")
rides

Unnamed: 0,pickup_datetime,location_id,pickup_hour
0,2023-01-01 00:32:10,161,2023-01-01 00:00:00
1,2023-01-01 00:55:08,43,2023-01-01 00:00:00
2,2023-01-01 00:25:04,48,2023-01-01 00:00:00
3,2023-01-01 00:03:48,138,2023-01-01 00:00:00
4,2023-01-01 00:10:29,107,2023-01-01 00:00:00
...,...,...,...
3066761,2023-01-31 23:58:34,107,2023-01-31 23:00:00
3066762,2023-01-31 23:31:09,112,2023-01-31 23:00:00
3066763,2023-01-31 23:01:05,114,2023-01-31 23:00:00
3066764,2023-01-31 23:40:00,230,2023-01-31 23:00:00


In [49]:
nr_rides_by_location = rides.groupby(['location_id', 'pickup_hour']).count().reset_index()
nr_rides_by_location.rename(columns = {"pickup_datetime":"nr_rides"}, inplace=True)
nr_rides_by_location

Unnamed: 0,location_id,pickup_hour,nr_rides
0,1,2023-01-01 05:00:00,1
1,1,2023-01-01 08:00:00,1
2,1,2023-01-01 11:00:00,1
3,1,2023-01-01 12:00:00,3
4,1,2023-01-01 13:00:00,4
...,...,...,...
71486,265,2023-01-31 19:00:00,2
71487,265,2023-01-31 20:00:00,1
71488,265,2023-01-31 21:00:00,3
71489,265,2023-01-31 22:00:00,5


In [61]:
from tqdm import tqdm

In [62]:
# Generate a full range time series for every location
def fill_missing_time_slots(rides:pd.DataFrame)->pd.DataFrame:
    full_df = pd.DataFrame()
    location_ids = rides.location_id.unique()
    full_range_ts = pd.date_range(rides.pickup_hour.min(), rides.pickup_hour.max(), freq = 'H')

    for location_id in tqdm(location_ids):
        location_subset = rides[rides.location_id==location_id]
        location_subset.set_index('pickup_hour', inplace=True)
        location_subset = location_subset.reindex(full_range_ts, fill_value=0)
        location_subset["location_id"] = location_id
        location_subset = location_subset.reset_index()
        location_subset.rename(columns={'index':'pickup_hour'}, inplace=True)
        full_df = pd.concat([full_df, location_subset]).reset_index(drop=True)
    return full_df

In [64]:
full_range_df = fill_missing_time_slots(rides = nr_rides_by_location)
print(len(full_range_df))

100%|██████████| 257/257 [00:00<00:00, 263.55it/s]

191208





In [66]:
full_range_df.to_parquet("../data/transformed/ts_data_2023_01.parquet")