In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')

from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour',
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
y_test.shape=(47946,)


In [3]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyperparameters, train a model and compute an average validation error based
    on a TimeSeriesSplit
    """
    
    # Pick hyperparameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_fraction", 3, 100),
    }
    
    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # Evaluate model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        
        scores.append(mae)
        
    return np.array(scores).mean()

In [4]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-09-21 17:10:38,053] A new study created in memory with name: no-name-c8bfcaef-3de1-4916-b23c-3f9b16794adc
[I 2024-09-21 17:10:42,075] Trial 0 finished with value: 1.8631223553153984 and parameters: {'num_leaves': 7, 'feature_fraction': 0.5152819434578803, 'bagging_fraction': 0.7458606184069176, 'min_child_fraction': 94}. Best is trial 0 with value: 1.8631223553153984.
[I 2024-09-21 17:11:11,203] Trial 1 finished with value: 1.5730572619105745 and parameters: {'num_leaves': 228, 'feature_fraction': 0.5663913890498459, 'bagging_fraction': 0.7829571820718311, 'min_child_fraction': 5}. Best is trial 1 with value: 1.5730572619105745.
[I 2024-09-21 17:11:32,951] Trial 2 finished with value: 1.4458968760628483 and parameters: {'num_leaves': 252, 'feature_fraction': 0.8464807790248255, 'bagging_fraction': 0.39283657343889633, 'min_child_fraction': 88}. Best is trial 2 with value: 1.4458968760628483.
[I 2024-09-21 17:11:45,862] Trial 3 finished with value: 1.5853921294851239 and paramet

In [5]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 252, 'feature_fraction': 0.8464807790248255, 'bagging_fraction': 0.39283657343889633, 'min_child_fraction': 88}


In [6]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [7]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae=}")

test_mae=np.float64(2.5080314123508187)


In [13]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=3979,
    predictions=pd.Series(predictions)
)