# Imports

In [11]:
import numpy as np
import pandas as pd

import datetime

# Config

In [12]:
# Add your input and output path
INPUT_PATH = 'ride_data/'
OUTPUT_PATH = 'data/labels.parquet'
# Start date (first month) , End date (last month)
start_date, end_date = '2023-01-01', '2023-04-30'    # January --> April

# Loading Dataset

In [13]:
def load_data(path, start_date: str, end_date: str):
    df = pd.read_parquet(path) 
    # Filter data based on the date
    start_date = datetime.date.fromisoformat(start_date)
    end_date = datetime.date.fromisoformat(end_date)
    filtered_df = df[(df['tpep_pickup_datetime'].dt.date >= start_date) &
                     (df['tpep_pickup_datetime'].dt.date <= end_date)]
    # Keep essential columns
    dataset = filtered_df.filter(items=['tpep_pickup_datetime', 'PULocationID'])
    # Add a new column for pick-up date
    dataset['PU_date'] = pd.to_datetime(dataset['tpep_pickup_datetime'].dt.date)
    return dataset

In [14]:
rides_df = load_data(INPUT_PATH, start_date, end_date)
print(f'rides_df shape : {rides_df.shape}')
rides_df.head()

rides_df shape : (12672629, 3)


Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
0,2023-01-01 00:32:10,161,2023-01-01
1,2023-01-01 00:55:08,43,2023-01-01
2,2023-01-01 00:25:04,48,2023-01-01
3,2023-01-01 00:03:48,138,2023-01-01
4,2023-01-01 00:10:29,107,2023-01-01


# Labeling

In [15]:
def labeling(dataset):
    dataset_labels = dataset.groupby(['PULocationID', 'PU_date'])['PU_date'].count().to_frame('Demand')\
        .sort_values(['PULocationID', 'PU_date'], ascending=[True, True]).reset_index()\
        .rename(columns={'PULocationID': 'Location', 'PU_date': 'Date'})
        
    locations = pd.DataFrame(dataset_labels['Location'].unique(), columns=['Location'])
    dates = pd.DataFrame(dataset_labels['Date'].unique(), columns=['Date'])
    # create a DataFrame for all locations and dates
    loc_date_df = locations.merge(dates, how='cross').\
        sort_values(['Location', 'Date'], ascending=[True, True]).reset_index(drop=True)
    # Fill the DataFrame with demands
    labels_df = loc_date_df.merge(dataset_labels, how='left', on=['Location', 'Date']).fillna(value=0)
    
    return labels_df

In [16]:
labels_df = labeling(rides_df)
print(f'labels_df shape : {labels_df.shape}')
labels_df.head()

labels_df shape : (31440, 3)


Unnamed: 0,Location,Date,Demand
0,1,2023-01-01,40.0
1,1,2023-01-02,31.0
2,1,2023-01-03,27.0
3,1,2023-01-04,7.0
4,1,2023-01-05,15.0


# File saving

In [17]:
def save_labels(dataset, path):
    labels_df = dataset.to_parquet(path, index=False)

In [18]:
save_labels(labels_df, OUTPUT_PATH)