# Imports

In [12]:
import datetime
import numpy as np
import pandas as pd

# Config

In [13]:
INPUT_PATH = 'rides/'
OUTPUT_PATH = 'data/labels.parquet'
START_DATE = '2023-01-01'
END_DATE = '2023-04-30'

# Loading Dataset

In [14]:
def load_data(path, start_date: str, end_date: str):
    df = pd.read_parquet(path) 
    start_date = datetime.date.fromisoformat(start_date)
    end_date = datetime.date.fromisoformat(end_date)
    filtered_df = df[(df['tpep_pickup_datetime'].dt.date >= start_date) &
                     (df['tpep_pickup_datetime'].dt.date <= end_date)]
    dataset = filtered_df.filter(items=['tpep_pickup_datetime', 'PULocationID'])
    dataset['PU_date'] = pd.to_datetime(dataset['tpep_pickup_datetime'].dt.date)
    return dataset

In [15]:
rides_df = load_data(INPUT_PATH, START_DATE, END_DATE)
print(f'rides_df shape : {rides_df.shape}')
rides_df.head()

rides_df shape : (12672629, 3)


Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
0,2023-01-01 00:32:10,161,2023-01-01
1,2023-01-01 00:55:08,43,2023-01-01
2,2023-01-01 00:25:04,48,2023-01-01
3,2023-01-01 00:03:48,138,2023-01-01
4,2023-01-01 00:10:29,107,2023-01-01


# Labeling

In [21]:
def extract_hours(dataset):
    dataset = dataset.sort_values('tpep_pickup_datetime')
    dataset['hour'] = dataset['tpep_pickup_datetime'].dt.hour
    
    bins = [-1, 3, 6, 9, 12, 15, 18, 21, 24]
    labels = ['0-3', '3-6', '6-9', '9-12', '12-15', '15-18', '18-21', '21-24']
    dataset['time_of_day'] = pd.cut(dataset['hour'], bins=bins, labels=labels, include_lowest=True)
    dataset = dataset.reset_index(drop = True)
    return dataset


In [22]:
def labeling(dataset):
    dataset = extract_hours(dataset)
    
    dataset_labels = (
        dataset
        .groupby(['PULocationID', 'PU_date', 'time_of_day'])['time_of_day']
        .count()
        .to_frame('Demand')
        .sort_values(['PULocationID', 'PU_date', 'time_of_day'], ascending=[True, True, True])
        .reset_index()
        .rename(columns={'PULocationID': 'Location', 'PU_date': 'Date', 'time_of_day' : 'Time'})
    )

    locations = pd.DataFrame(dataset_labels['Location'].unique(), columns=['Location'])
    dates = pd.DataFrame(dataset_labels['Date'].unique(), columns=['Date'])
    times = pd.DataFrame(dataset_labels['Time'].unique(), columns=['Time'])

    location_date_time_df = (
        locations
        .merge(dates, how='cross')
        .merge(times, how='cross')
        .sort_values(['Location', 'Date', 'Time'], ascending=[True, True, True])
        .reset_index(drop=True)
    )

    labels_df = (
        location_date_time_df
        .merge(dataset_labels, how='left', on=['Location', 'Date', 'Time'])
        .fillna(value=0)
    )

    return labels_df

In [23]:
labels_df = labeling(rides_df)
print(f'labels_df shape : {labels_df.shape}')
labels_df.head()

labels_df shape : (251520, 4)


Unnamed: 0,Location,Date,Time,Demand
0,1,2023-01-01,0-3,0
1,1,2023-01-01,3-6,1
2,1,2023-01-01,6-9,1
3,1,2023-01-01,9-12,4
4,1,2023-01-01,12-15,18


# File saving

In [19]:
def save_labels(dataset, path):
    labels_df = dataset.to_parquet(path, index=False)

In [20]:
save_labels(labels_df, OUTPUT_PATH)