# 09 | LightGBM Model with Feature Engineering #
## Introduction ##
In this notebook, in addition to the feature engineering that we created in the first version of notebook #9, we will create a new feature. We will replace the pickup location id, which is a categorical variable, with **centroids** for each location id. The goal is to provide a more meaningful value, hoping to improve the accuracy of our model.

## Load the training data ##

In [46]:
# Always reload our imports to make sure they reflect changes.
%reload_ext autoreload
%autoreload 2

In [47]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR
from datetime import datetime
from src.data_split import train_test_split
from src.convert_shp_coords import create_centroid_coords, convert_gdf_to_dataframe, merge_coords_df_with_training_df
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline

In [48]:
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89300,3.0,0.0,2.0,3.0,2.0,3.0,13.0,8.0,9.0,9.0,...,6.0,5.0,3.0,1.0,6.0,1.0,3.0,2022-12-27,265,3.0
89301,6.0,4.0,0.0,0.0,2.0,0.0,14.0,7.0,8.0,4.0,...,4.0,2.0,1.0,2.0,2.0,2.0,8.0,2022-12-28,265,1.0
89302,7.0,2.0,3.0,4.0,7.0,4.0,10.0,9.0,7.0,11.0,...,2.0,3.0,5.0,1.0,1.0,0.0,8.0,2022-12-29,265,3.0
89303,6.0,5.0,4.0,3.0,0.0,3.0,11.0,12.0,9.0,10.0,...,3.0,3.0,1.0,2.0,0.0,1.0,2.0,2022-12-30,265,7.0


We can now import our functions, which will:
- Import a `shp` file for NY containing the `location_id`, as well as its geometries.
- Convert the geometry to the appropriate Coordinate Reference System (CRS).
- Get the centroid for each geometry.
- Reproject back the centroids to geographic CRS (`EPSG 4326`).
- Extract the latitude and longitude of the centroids.
- Return a `DataFrame` containing the `location_id`, the `latitude` and the `longitude` for each location id.

In [49]:
coordinates = create_centroid_coords()
df_coords = convert_gdf_to_dataframe(coordinates)
df_coords

Unnamed: 0,location_id,latitude,longitude
0,1.0,40.691830,-74.174002
1,2.0,40.616746,-73.831300
2,3.0,40.864474,-73.847422
3,4.0,40.723752,-73.976968
4,5.0,40.552659,-74.188485
...,...,...,...
258,256.0,40.710880,-73.959905
259,259.0,40.897932,-73.852215
260,260.0,40.744234,-73.906307
261,261.0,40.709139,-74.013023


In [50]:
df_coords["latitude"].name

'latitude'

Great! Now, we can merge our result with our training dataset.

In [51]:
df_merged = merge_coords_df_with_training_df(
    training_df=df,
    coord_df=df_coords,
    coord_id="location_id",
    training_id="pickup_location_id"
)

In [52]:
df_merged

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,target_rides_next_hour,latitude,longitude
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,0.0,40.691830,-74.174002
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,0.0,40.691830,-74.174002
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,0.0,40.691830,-74.174002
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,0.0,40.691830,-74.174002
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,0.0,40.691830,-74.174002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88626,17.0,11.0,4.0,1.0,7.0,20.0,58.0,132.0,145.0,123.0,...,83.0,69.0,73.0,64.0,50.0,23.0,2022-12-27,21.0,40.778766,-73.951010
88627,30.0,22.0,5.0,10.0,4.0,23.0,70.0,127.0,117.0,155.0,...,94.0,88.0,65.0,68.0,63.0,39.0,2022-12-28,29.0,40.778766,-73.951010
88628,34.0,18.0,15.0,13.0,9.0,22.0,75.0,111.0,126.0,130.0,...,110.0,93.0,87.0,85.0,92.0,57.0,2022-12-29,43.0,40.778766,-73.951010
88629,57.0,40.0,21.0,16.0,10.0,22.0,72.0,114.0,145.0,152.0,...,118.0,84.0,106.0,84.0,96.0,77.0,2022-12-30,45.0,40.778766,-73.951010


Ok! Let's reproduce the previous notebook now and compare the results.

## Split the data into train and test sets ##

In [54]:
X_train, y_train, X_test, y_test = train_test_split(
    df_merged,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32349, 675)
y_train.shape=(32349,)
X_test.shape=(56282, 675)
y_test.shape=(56282,)


In [18]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds one column with the average rides from:

    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago

    Args:

    - `X` (`pd.DataFrame`): Pandas DataFrame with features

    Returns:

    - `pd.DataFrame` with extra column `average_rides_last_4_weeks`
    """
    X["average_rides_last_4_weeks"] = 0.25 * (
        X[f"rides_previous_{7*24}_hour"] + \
        X[f"rides_previous_{2*7*24}_hour"] + \
        X[f"rides_previous_{3*7*24}_hour"] + \
        X[f"rides_previous_{4*7*24}_hour"]   
    )

    return X

In [19]:
add_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [20]:
add_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,latitude,longitude,average_rides_last_4_weeks
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,40.691830,-74.174002,0.00
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,40.691830,-74.174002,0.00
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,40.691830,-74.174002,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,40.691830,-74.174002,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,40.691830,-74.174002,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32344,47.0,22.0,21.0,21.0,25.0,20.0,84.0,113.0,131.0,149.0,...,167.0,170.0,137.0,124.0,107.0,64.0,2022-05-27,40.778766,-73.951010,58.50
32345,127.0,83.0,52.0,33.0,27.0,19.0,33.0,56.0,92.0,131.0,...,142.0,103.0,100.0,82.0,86.0,96.0,2022-05-28,40.778766,-73.951010,121.25
32346,154.0,116.0,62.0,49.0,29.0,16.0,38.0,52.0,59.0,87.0,...,115.0,118.0,95.0,84.0,82.0,92.0,2022-05-29,40.778766,-73.951010,131.25
32347,19.0,15.0,11.0,5.0,10.0,27.0,55.0,77.0,98.0,86.0,...,86.0,88.0,90.0,81.0,76.0,55.0,2022-05-30,40.778766,-73.951010,21.00


In [21]:
class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):

        X_ = X.copy()

        # Generate numeric columns from datetime
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour"])

In [22]:
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,latitude,longitude,average_rides_last_4_weeks,hour,day_of_week
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,40.691830,-74.174002,0.00,0,5
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,1.0,0.0,0.0,40.691830,-74.174002,0.00,0,6
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,40.691830,-74.174002,0.00,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,40.691830,-74.174002,0.00,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.691830,-74.174002,0.00,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32344,47.0,22.0,21.0,21.0,25.0,20.0,84.0,113.0,131.0,149.0,...,170.0,137.0,124.0,107.0,64.0,40.778766,-73.951010,58.50,0,4
32345,127.0,83.0,52.0,33.0,27.0,19.0,33.0,56.0,92.0,131.0,...,103.0,100.0,82.0,86.0,96.0,40.778766,-73.951010,121.25,0,5
32346,154.0,116.0,62.0,49.0,29.0,16.0,38.0,52.0,59.0,87.0,...,118.0,95.0,84.0,82.0,92.0,40.778766,-73.951010,131.25,0,6
32347,19.0,15.0,11.0,5.0,10.0,27.0,55.0,77.0,98.0,86.0,...,88.0,90.0,81.0,76.0,55.0,40.778766,-73.951010,21.00,0,0


In [23]:
pipeline = make_pipeline(
    add_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.157997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154782
[LightGBM] [Info] Number of data points in the train set: 32349, number of used features: 676
[LightGBM] [Info] Start training from score 11.496182


In [24]:
predictions = pipeline.predict(X_test)


In [25]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae=:.4f}")

test_mae=2.5439


Observtions:
- **Great news**! After some feature engineering, our model  **improved** from `2.5947` to `2.5439`. 
- This is better than our first attempt before applying feature engineering (`2.5776`).
- Feature engineering can be tiresome (it took me some time to figure out the solution), but it can add more context to your model!
