In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Read the tabular dataset from the transformed-data directory and preview it.
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

# Split the data around the cutoff date; the exact boundary handling is
# defined by src.data_split.train_test_split.
split_cutoff = datetime(2022, 6, 1)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    cutoff_date=split_cutoff,
    target_column_name='target_rides_next_hour',
)

# Report the shape of each of the four pieces.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
y_test.shape=(47946,)


In [4]:
# Baseline model 1
class BaselineModelPreviousHour:
    """Baseline that forecasts the target hour as the demand of the previous hour.

    The prediction is read directly from the ``rides_previous_1_hour``
    feature column; nothing is learned from the training data.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Do nothing: this baseline has no parameters to estimate."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """Return the ``rides_previous_1_hour`` column of ``X_test``."""
        previous_hour_feature = 'rides_previous_1_hour'
        return X_test[previous_hour_feature]

In [5]:
# The previous-hour baseline's fit() does nothing, so predict directly.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test=X_test)

predictions

0         0.0
1         5.0
2        13.0
3        12.0
4        14.0
         ... 
47941     0.0
47942     0.0
47943     0.0
47944     0.0
47945     0.0
Name: rides_previous_1_hour, Length: 47946, dtype: float32

In [6]:
# Check predictions against real values for baseline number 1
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the baseline 1 predictions against the test targets.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae=:.4f}")

test_mae=6.1138


In [7]:
# Baseline model 2

class BaselineModelPreviousWeek:
    """Baseline that forecasts hour t as the demand observed at t - 7 days.

    Example: the forecast for Friday 6pm is the value seen on Friday 6pm
    of the previous week.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Do nothing: this baseline has no parameters to estimate."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """Return the feature column lagged by exactly one week (7 * 24 hours)."""
        one_week_in_hours = 7 * 24
        lag_column = f'rides_previous_{one_week_in_hours}_hour'
        return X_test[lag_column]

In [8]:
# Score the previous-week baseline on the test set.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test=X_test)

test_mae_baseline_2 = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae_baseline_2=:.4f}")

test_mae_baseline_2=3.4420


In [9]:
# Baseline model 3
class BaselineModelLast4Weeks:
    """Baseline that forecasts hour t as the mean of the same hour in the last four weeks.

    The lags used are t - 7, t - 14, t - 21 and t - 28 days. Example: the
    forecast for Friday 6pm is the average of Friday 6pm over the previous
    four weeks.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Do nothing: this baseline has no parameters to estimate."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """Average the four weekly-lag feature columns of ``X_test`` row by row."""
        hours_per_week = 7 * 24
        weekly_lags = [
            X_test[f'rides_previous_{hours_per_week * weeks_back}_hour']
            for weeks_back in (1, 2, 3, 4)
        ]
        # Accumulate left to right, starting from the most recent week.
        running_total = weekly_lags[0]
        for lagged_rides in weekly_lags[1:]:
            running_total = running_total + lagged_rides
        return 0.25 * running_total

In [11]:
# Score the four-week-average baseline on the test set.
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test=X_test)

test_mae_baseline_3 = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae_baseline_3=:.4f}")

test_mae_baseline_3=3.0108
