In [11]:
import numpy as np
import pandas as pd
from pathlib import Path
import holidays
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [12]:
# Read the training and final-test tables from the local ``data`` folder.
df_train, df_test = (
    pd.read_parquet(Path('data') / file_name)
    for file_name in ('train.parquet', 'final_test.parquet')
)

# The target is the ``log_bike_count`` column; both count columns are removed
# from the training features.
y_train = df_train['log_bike_count']
X_train = df_train.drop(columns=['log_bike_count', 'bike_count'])
X_test = df_test  # same object as df_test, not a copy

In [13]:
def _encode_dates(X):
    X = X.copy()
    X['date'] = pd.to_datetime(X['date'])
    X['month'] = X['date'].dt.month
    X['day'] = X['date'].dt.day
    X['weekday'] = X['date'].dt.weekday
    X['hour'] = X['date'].dt.hour

    fr_holidays = holidays.France(years=[2020, 2021])
    def is_holiday(date):
        weekday = date.weekday()
        if weekday > 4 or date in fr_holidays:
            return 1
        else:
            return 0
    X['is_holiday'] = X['date'].apply(is_holiday)
    return X


def _merge_external_data(X):
    file_path = Path('data') / 'external_data.csv'
    df_ext = pd.read_csv(file_path, parse_dates=['date'])
    X = X.copy()
    X['date'] = X['date'].astype('datetime64[ns]')
    X['orig_index'] = np.arange(X.shape[0])
    cols_to_merge = ['date', 'pmer', 'tend', 'cod_tend', 'tend24',
                                        'dd', 'ff', 't', 'td', 'u', 'vv',  
                                        'n', 'pres', 'raf10', 'ww', 'nbas',
                                        'ht_neige', 'rr1', 'rr6',]
    X = pd.merge_asof(
        X.sort_values('date'), df_ext[cols_to_merge].sort_values('date'), on='date'
    )  
    X = X.sort_values('orig_index')
    del X['orig_index']
    for col in cols_to_merge:
        if X[col].isnull().any():
            X[col].fillna(X[col].mean(), inplace=True)
    return X


def one_hot_encode_and_concat(X, encoder=None):
    """Replace ``counter_id`` with one-hot indicator columns and drop ``date``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the ``counter_id`` and ``date`` columns. It is not modified.
    encoder : fitted encoder, optional
        An already fitted ``OneHotEncoder`` (or any object exposing ``transform``
        and ``get_feature_names_out``). Pass the encoder fitted on the training
        frame when encoding another frame, so both get the same indicator columns.
        When omitted, a new ``OneHotEncoder(handle_unknown='ignore')`` is fitted
        on ``X`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``X`` followed by the indicator columns, on a
        fresh 0..n-1 index.
    """
    categorical_columns = ['counter_id']
    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = encoder.fit_transform(X[categorical_columns])
    else:
        # Reuse the supplied categories; do not refit on this frame.
        encoded = encoder.transform(X[categorical_columns])

    # Encoders may return a sparse matrix or a dense array; densify only if needed.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    one_hot_encoded_df = pd.DataFrame(
        encoded, columns=encoder.get_feature_names_out(categorical_columns)
    )
    # Both parts are put on the same positional index so concat aligns row by row.
    X_dropped = X.drop(columns=categorical_columns + ['date']).reset_index(drop=True)
    return pd.concat([X_dropped, one_hot_encoded_df], axis=1)


def _drop_columns(X):
    res = X.copy()
    res = res.drop(columns=['counter_name',
        'coordinates',
        'site_name',
        'site_id',
        'counter_technical_id',
        'counter_installation_date',
        'latitude', 
        'longitude'
        ])
    return res

In [14]:
# Run the feature-engineering steps on the training features, one after another.
# Note: this overwrites X_train, so the cell cannot be re-run without reloading it.
X_train = (
    X_train
    .pipe(_encode_dates)
    .pipe(_merge_external_data)
    .pipe(one_hot_encode_and_concat)
    .pipe(_drop_columns)
)

In [None]:
# Finding the best regressor

# Candidate model families. This grid is not consumed by the search in this cell,
# which only tunes XGBoost.
param_grid_reg = {
    'regressor': [XGBRegressor(), RandomForestRegressor(), ExtraTreesRegressor(), Ridge(), LinearRegression()]
}


# Optimising the hyperparameters of xgboost

regressor = XGBRegressor()

# 'passthrough' is a valid no-op step, so the pipeline can also be fitted on its own;
# the grid below substitutes each scaler candidate for it.
pipeline = Pipeline([
    ('scaler', 'passthrough'),
    ('regressor', regressor)
])

param_grid_xgboost = {
    'scaler': [MinMaxScaler(), StandardScaler(), None],   # None disables scaling.
    'regressor__max_depth': [6, 8, 10],                   # Maximum depth of the trees.
    'regressor__learning_rate': [0.1, 0.15, 0.2],         # Learning rate.
    'regressor__n_estimators': [100, 200, 300],           # Number of trees.
}

# 3 scalers x 3 depths x 3 learning rates x 3 tree counts = 81 candidates per fold.
# Scores are negated MSE, so values closer to 0 are better.
grid_search = GridSearchCV(
    pipeline,
    param_grid_xgboost,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    error_score='raise'   # fail fast instead of recording NaN for a broken candidate
)

grid_search.fit(X_train, y_train)

print('\n')
print('Best Parameters Found:')
print(grid_search.best_params_)

print('\n')
print('Best Score: ', grid_search.best_score_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


Best Parameters Found:
{'regressor__learning_rate': 0.15, 'regressor__max_depth': 10, 'regressor__n_estimators': 300, 'scaler': MinMaxScaler()}


Best Score:  -0.96643995818356


In [None]:
# Baseline comparison of model families with default hyperparameters.
# The two randomised forest ensembles get a fixed seed so that the printed
# scores are the same on every run of the notebook.
regressors = [
    ("XGBRegressor", XGBRegressor()),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42)),
    ("ExtraTreesRegressor", ExtraTreesRegressor(random_state=42)),
    ("Ridge", Ridge()),
    ("LinearRegression", LinearRegression())
    ]


for name, reg in regressors:
    pipeline = Pipeline([('regressor', reg)])
    # cross_val_score returns negated MSE per fold; flip the sign for reporting.
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f"{name} - MSE: {-np.mean(scores):.3f} ± {np.std(scores):.3f}")