This notebook explores how to encode the temporal features. As we're training a tree-based model, we'll use radial basis functions (RBFs), but if we were using a linear model we could also try cyclical encodings.

In [1]:
import sys
# Put the parent directory (relative to the kernel's working directory) on the
# import path so the project-local `src` package below can be resolved.
sys.path.append('../')

# Project helpers: model construction / training and data loading.
import src.train as train
import src.utils.helper as helper

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames rather than NumPy arrays.
set_config(transform_output = "pandas")

# feature_engine encoders for the categorical columns.
from feature_engine import encoding as ce

# Radial-basis-function encoder for the periodic (temporal) columns.
from sklego.preprocessing import RepeatingBasisFunction

from xgboost import XGBRegressor


In [2]:
# Load the base dataset together with the column-name lists for each feature type.
base_data = helper.load_base_data()
data, discrete, temporal, continuous, categorical = base_data
data.head()

Discrete: ['Latitude', 'Longitude', 'Altitude']
Temporal: ['month', 'hour']
Continuous: ['Humidity', 'AmbientTemp', 'Wind.Speed', 'Visibility', 'Pressure', 'Cloud.Ceiling']
Categorical: ['Location', 'Season']


Unnamed: 0,Location,Latitude,Longitude,Altitude,Season,Humidity,AmbientTemp,Wind.Speed,Visibility,Pressure,Cloud.Ceiling,PolyPwr,month,hour
0,Camp Murray,47.11,-122.57,25.480965,Winter,81.71997,12.86919,8.053964,16.096495,1010.6,22.02899,2.42769,12,11
1,Camp Murray,47.11,-122.57,25.480965,Winter,96.64917,9.66415,0.0,16.096495,1011.3,0.701755,2.46273,12,13
2,Camp Murray,47.11,-122.57,25.480965,Winter,93.61572,15.44983,8.053964,16.096495,1011.6,0.976354,4.46836,12,13
3,Camp Murray,47.11,-122.57,25.480965,Winter,77.21558,10.36659,8.053964,3.219299,1024.4,0.183066,1.65364,12,12
4,Camp Murray,47.11,-122.57,25.480965,Winter,54.80347,16.85471,4.832378,4.828948,1023.7,0.2746,6.57939,12,14


In [3]:
# Assemble the predictors from the grouped feature lists; PolyPwr is the target.
feature_columns = discrete + temporal + continuous + categorical
X = data[feature_columns].copy()
y = data['PolyPwr'].copy()

# Keep 80% of the observations for training; the fixed seed makes the split reproducible.
split = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = split

print(X_train.shape, X_test.shape)
print(X.columns)


(16836, 13) (4209, 13)
Index(['Latitude', 'Longitude', 'Altitude', 'month', 'hour', 'Humidity',
       'AmbientTemp', 'Wind.Speed', 'Visibility', 'Pressure', 'Cloud.Ceiling',
       'Location', 'Season'],
      dtype='object')


The custom validation I've defined with early stopping doesn't work with `GridSearchCV`, which has its own cross-validation set-up. So, when tuning the number of basis functions and their widths, we'll instantiate an `XGBRegressor` explicitly with parameters matching the defaults defined in `train.py`, and omit early stopping while optimising. The next cell first scores the pipeline with the month RBF encoding using `train.train_model`; the grid search follows further below.

In [4]:

# --- Categorical features ---------------------------------------------------
# Group rare `Location` categories together before encoding.
rare_location_grouper = ce.RareLabelEncoder(tol=0.05,
                                            n_categories=7,
                                            variables=['Location'])

# Integer-encode `Season` (codes ordered by the target) and `Location`
# (codes assigned arbitrarily).
season_encoder = ce.OrdinalEncoder(encoding_method='ordered',
                                   variables=['Season'])
location_encoder = ce.OrdinalEncoder(encoding_method='arbitrary',
                                     variables=['Location'])

# --- Temporal features ------------------------------------------------------
# Encode `month` with 12 repeating radial basis functions; every other column
# is passed through untouched.
# NOTE(review): sklego appears to min-max scale the column with `input_range`
# before measuring cyclic distance, so (1, 12) may place months 1 and 12 on
# the same point of the cycle -- confirm against the sklego implementation.
month_rbf = RepeatingBasisFunction(remainder="passthrough",
                                   n_periods=12,
                                   column="month",
                                   width=1.0,
                                   input_range=(1,12))

# --- Full pipeline: encoders, month RBF, then the project's default model ---
pipeline = Pipeline(steps=[
    ('rare_label_encoder', rare_location_grouper),
    ('categorical_encoder', season_encoder),
    ('categorical_encoder_loc', location_encoder),
    ('rbf_month', month_rbf),
    ('xgb', train.build_model()),
])

# Score the pipeline on the training data with the project's training helper.
scores = train.train_model(inputs=X_train, target=y_train, pipeline=pipeline)


CV Average R2: 0.6604903303585719


An example of the grid search used to optimise the RBF parameters is given below, here tuning `n_periods` and `width` for the hour feature.

In [5]:
# XGBoost settings used during the search. Early stopping is not configured
# here because GridSearchCV runs its own cross-validation.
xgb_params = {'objective': 'reg:squarederror',
              'n_estimators': 200,
              'learning_rate': 0.01,
              }

model = XGBRegressor(**xgb_params, random_state=0)

# Note, RepeatingBasisFunction loses the names of the columns, so the hour
# step has to address `hour` by position. The month step is expected to emit
# its `month_periods` basis columns first, followed by the remaining columns in
# their original order, so the position is derived from the training frame
# instead of being hard-coded.
month_periods = 12
hour_column_index = month_periods + X_train.columns.drop('month').get_loc('hour')

grid_search_pipeline = Pipeline([

    # Group rare categories
    ('rare_label_encoder',
     ce.RareLabelEncoder(tol=0.05,
                         n_categories=7,
                         variables=['Location'])),

    # Label encode
    ('categorical_encoder',
     ce.OrdinalEncoder(encoding_method='ordered',
                       variables=['Season'])),

    ('categorical_encoder_loc',
     ce.OrdinalEncoder(encoding_method='arbitrary',
                       variables=['Location'])),

    # Month encoder. `n_periods` is tied to `month_periods` so that it cannot
    # drift out of sync with `hour_column_index`.
    ('rbf_month',
     RepeatingBasisFunction(remainder="passthrough",
                            n_periods=month_periods,
                            column="month",
                            width=1.0,
                            input_range=(1,12))),

    # Hour encoder (addressed by position, see the note above). These values
    # are the starting point; the grid below overrides n_periods and width.
    ('rbf_hour',
     RepeatingBasisFunction(remainder="passthrough",
                            n_periods=24,
                            width=0.5,
                            column=hour_column_index,
                            input_range=(0,23))),

    # Model
    ('xgb', model)

])


# Only the hour RBF is tuned; the month RBF keeps its fixed settings.
param_grid = {
    'rbf_hour__n_periods': [22, 24, 26],
    'rbf_hour__width': [0.3, 0.5, 0.7],
}

grid_search = GridSearchCV(grid_search_pipeline, param_grid,
                           cv=5, n_jobs=-1, scoring='r2')

grid_search.fit(X_train, y_train)

# `best_score_` is the mean cross-validated R2 of the best parameter set.
# `score(...)` instead evaluates the best estimator refit on the whole training
# set, so those two numbers are plain train/test R2 values, not CV averages.
print(f"CV Average R2 (best params): {grid_search.best_score_}")
print(f"R2 for Train Set (refit best estimator): {grid_search.score(X_train, y_train)}")
print(f"R2 for Test Set (refit best estimator): {grid_search.score(X_test, y_test)}")

print(grid_search.best_params_)


CV Average R2 for Train Set: 0.6538648909487805
CV Average R2 for Test Set: 0.6262885305715482
{'rbf_hour__n_periods': 24, 'rbf_hour__width': 0.3}


### Summary
Tuning the RBFs for the month feature increased the cross-validated R2 to around 0.6605, using 12 basis functions with a width of 1.0. Doing the same for the hour feature didn't increase the score, so it was left as a plain integer (ordinal) feature. The pipeline is updated as follows.

In [6]:
# Display the month-RBF pipeline built and scored above.
pipeline