# Imports

In [1]:
import datetime
import numpy as np
import pandas as pd

# Config

In [2]:
INPUT_PATH = '../data/datasets/'
OUTPUT_PATH = '../data/labels/labeled.parquet'
START_DATE = '2023-01-01'
END_DATE = '2023-02-01'

In [3]:
# def load_data():
from dateutil.relativedelta import relativedelta  

END_DATE = datetime.date.fromisoformat(str(END_DATE))
START_DATE = datetime.date.fromisoformat(str(END_DATE - relativedelta(months=1)))

filtered_df = pd.read_parquet(INPUT_PATH,filters=[('tpep_pickup_datetime','<',END_DATE),
                            ('tpep_pickup_datetime','>',START_DATE)])
dataset = filtered_df.filter(items=['tpep_pickup_datetime', 'PULocationID'])
dataset['PU_date'] = pd.to_datetime(dataset['tpep_pickup_datetime'].dt.date)

print(dataset.shape)
dataset.head()

(3066725, 3)


Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
0,2023-01-01 00:32:10,161,2023-01-01
1,2023-01-01 00:55:08,43,2023-01-01
2,2023-01-01 00:25:04,48,2023-01-01
3,2023-01-01 00:03:48,138,2023-01-01
4,2023-01-01 00:10:29,107,2023-01-01


In [4]:
dataset.tail()

Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
3066720,2023-01-31 23:59:23,211,2023-01-31
3066721,2023-01-31 23:59:04,68,2023-01-31
3066722,2023-01-31 23:58:55,114,2023-01-31
3066723,2023-01-31 23:50:15,163,2023-01-31
3066724,2023-01-31 23:49:00,132,2023-01-31


In [7]:
l = labeling(dataset)

In [8]:
l.tail()

Unnamed: 0,Location,Date,Demand
7962,265,2023-01-27,54.0
7963,265,2023-01-28,62.0
7964,265,2023-01-29,57.0
7965,265,2023-01-30,43.0
7966,265,2023-01-31,40.0


In [35]:
t=pd.DataFrame(dataset['PULocationID'].unique(), columns=['Location'])
t['Location']

0      161
1       43
2       48
3      138
4      107
      ... 
252     99
253    245
254     46
255      2
256     58
Name: Location, Length: 257, dtype: int64

In [36]:
print(len(l['Location'].unique()))
p = pd.DataFrame({'Location':l['Location'].unique(),'Date':[END_DATE]*len(l['Location'].unique()), 'Demand':[None]*len(l['Location'].unique())})
new = pd.concat([l,p]).sort_values(['Location', 'Date'], ascending=[True, True])
print(new)

257
      Location                 Date  Demand
0            1  2023-01-01 00:00:00    40.0
1            1  2023-01-02 00:00:00    31.0
2            1  2023-01-03 00:00:00    27.0
3            1  2023-01-04 00:00:00     7.0
4            1  2023-01-05 00:00:00    15.0
...        ...                  ...     ...
7963       265  2023-01-28 00:00:00    62.0
7964       265  2023-01-29 00:00:00    57.0
7965       265  2023-01-30 00:00:00    43.0
7966       265  2023-01-31 00:00:00    40.0
256        265           2023-02-01     NaN

[8224 rows x 3 columns]


In [42]:
pd.to_datetime(new['Date']).dt.dayofweek

0       6
1       0
2       1
3       2
4       3
       ..
7963    5
7964    6
7965    0
7966    1
256     2
Name: Date, Length: 8224, dtype: int64

In [38]:
print(type(pd.to_datetime(new['Date'])))

<class 'pandas.core.series.Series'>


In [26]:
new['Date'] = pd.to_datetime(new['Date'])

In [44]:
new[].isna().sum()

Location      0
Date          0
Demand      257
dtype: int64

In [21]:
def feature_engineering(dataset):
    dataset['Previous_day_demand'] = dataset.groupby(['Location'])['Demand'].shift(1)
    dataset['Previous_week_demand'] = dataset.groupby(['Location'])['Demand'].shift(7)
    dataset['Day_of_week'] = dataset['Date'].dt.dayofweek   
    dataset['Day_of_month'] = dataset['Date'].dt.day
    return dataset

In [27]:
f = feature_engineering(new)

In [31]:
f[f['Date']==pd.to_datetime(END_DATE)]

Unnamed: 0,Location,Date,Demand,Previous_day_demand,Previous_week_demand,Day_of_week,Day_of_month
0,1,2023-02-01,,8.0,7.0,2,1
1,2,2023-02-01,,0.0,0.0,2,1
2,3,2023-02-01,,0.0,3.0,2,1
3,4,2023-02-01,,60.0,67.0,2,1
4,5,2023-02-01,,2.0,1.0,2,1
...,...,...,...,...,...,...,...
252,261,2023-02-01,,365.0,404.0,2,1
253,262,2023-02-01,,1547.0,1695.0,2,1
254,263,2023-02-01,,1891.0,2166.0,2,1
255,264,2023-02-01,,1499.0,1148.0,2,1


In [47]:
f.isna().sum()

Location                   0
Date                       0
Demand                   257
Previous_day_demand      257
Previous_week_demand    1799
Day_of_week                0
Day_of_month               0
dtype: int64

# Loading Dataset

In [3]:
def load_data(path, start_date: str, end_date: str):
    df = pd.read_parquet(path) 
    start_date = datetime.date.fromisoformat(start_date)
    end_date = datetime.date.fromisoformat(end_date)
    filtered_df = df[(df['tpep_pickup_datetime'].dt.date >= start_date) &
                     (df['tpep_pickup_datetime'].dt.date <= end_date)]
    dataset = filtered_df.filter(items=['tpep_pickup_datetime', 'PULocationID'])
    dataset['PU_date'] = pd.to_datetime(dataset['tpep_pickup_datetime'].dt.date)
    return dataset

In [4]:
rides_df = load_data(INPUT_PATH, START_DATE, END_DATE)
print(f'rides_df shape : {rides_df.shape}')
rides_df.head()

rides_df shape : (12672629, 3)


Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
0,2023-01-01 00:32:10,161,2023-01-01
1,2023-01-01 00:55:08,43,2023-01-01
2,2023-01-01 00:25:04,48,2023-01-01
3,2023-01-01 00:03:48,138,2023-01-01
4,2023-01-01 00:10:29,107,2023-01-01


# Labeling

In [6]:
def labeling(dataset):
    dataset_labels = (
        dataset
        .groupby(['PULocationID', 'PU_date'])['PU_date']
        .count()
        .to_frame('Demand')
        .sort_values(['PULocationID', 'PU_date'], ascending=[True, True])
        .reset_index()
        .rename(columns={'PULocationID': 'Location', 'PU_date': 'Date'})
    )
  
    locations = pd.DataFrame(dataset_labels['Location'].unique(), columns=['Location'])
    dates = pd.DataFrame(dataset_labels['Date'].unique(), columns=['Date'])
    
    location_date_df = (
        locations
        .merge(dates, how='cross')
        .sort_values(['Location', 'Date'], ascending=[True, True])
        .reset_index(drop=True)
    )
    
    labels_df = (
        location_date_df
        .merge(dataset_labels, how='left', on=['Location', 'Date'])
        .fillna(value=0)
    )
    
    return labels_df

In [6]:
labels_df = labeling(rides_df)
print(f'labels_df shape : {labels_df.shape}')
labels_df.head()

labels_df shape : (31440, 3)


Unnamed: 0,Location,Date,Demand
0,1,2023-01-01,40.0
1,1,2023-01-02,31.0
2,1,2023-01-03,27.0
3,1,2023-01-04,7.0
4,1,2023-01-05,15.0


# File saving

In [7]:
def save_labels(dataset, path):
    labels_df = dataset.to_parquet(path, index=False)

In [8]:
save_labels(labels_df, OUTPUT_PATH)