In [60]:
import pandas as pd
import numpy as np

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.ensemble import GradientBoostingRegressor

from xgboost import XGBRegressor

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [61]:
bike = pd.read_csv("../data/train.csv", index_col=0, parse_dates=True)
biketest = pd.read_csv("../data/test.csv", index_col=0, parse_dates=True)

In [62]:
bike["year"] = bike.index.year
bike["month"] = bike.index.month
bike["weekday"] = bike.index.weekday #from 0-6
bike["hour"] = bike.index.hour

biketest["year"] = biketest.index.year
biketest["month"] = biketest.index.month
biketest["weekday"] = biketest.index.weekday #from 0-6
biketest["hour"] = biketest.index.hour

print(list(bike.columns))

['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'year', 'month', 'weekday', 'hour']


In [63]:
X = bike[['temp', "humidity", 'windspeed', 'year', 'month','weekday', 'hour']]
y = bike['count']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

ytrainlog = np.log1p(ytrain)
ytestlog= np.log1p(ytest)

In [64]:
sclr = MinMaxScaler()

ohe_pipe = Pipeline([
    ("ohe", OneHotEncoder(sparse_output=False, drop="first")),
])

pol4_scl_pipe = Pipeline([
    ("poly", PolynomialFeatures(degree=4)),
    ("scl", sclr),
])

pol2_scl_pipe = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("scl", sclr),
])

In [65]:
print(list(Xtrain.columns))

['temp', 'humidity', 'windspeed', 'year', 'month', 'weekday', 'hour']


In [66]:
ct = ColumnTransformer(
    [
        ("ohe", ohe_pipe, ["year"]),
        #("poly2_scl", pol2_scl_pipe, ['temp', 'month']),
        #("poly4_scl", pol4_scl_pipe, ['hour']),
        ("poly_scl", pol4_scl_pipe, ['temp', "humidity", 'windspeed', 'month', 'weekday', 'hour']),
    ],
    remainder=sclr
).set_output(transform="pandas")

In [67]:
%%script echo skipping

gbr = GradientBoostingRegressor(verbose=True, loss="absolute_error", max_features=None, max_depth=4, n_estimators=200)

ctplusmodel = Pipeline(
    [
        ("ct", ct),
        ("model", gbr),
    ]
)

skipping


In [68]:
%%script echo skipping

ctplusmodel.fit(Xtrain, ytrainlog)

skipping


In [69]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [70]:
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [71]:
%%script echo skipping

print("train r^2 = ",round(ctplusmodel.score(Xtrain, ytrainlog), 3))
print("test r^2  = ",round(ctplusmodel.score(Xtest, ytestlog), 3))
print()
print("train rmsle = ",round(rmsle(ytrain, np.expm1(ctplusmodel.predict(Xtrain))), 3))
print("test rmsle  = ",round(rmsle(ytest, np.expm1(ctplusmodel.predict(Xtest))), 3))

skipping


In [72]:
Xtrain_fe = ct.fit_transform(Xtrain)
Xtest_fe = ct.transform(Xtest)
X_fe = ct.fit_transform(X)
ylog = np.log1p(y)

In [73]:
%%script echo skip

maxthreads = 12
threads = 6

xgb_model = XGBRegressor(
        #n_jobs=maxthreads//threads,
        tree_method="gpu_hist",
        objective="reg:squaredlogerror",
    )

clf = GridSearchCV(
        xgb_model,
        {"max_depth": [4, 5, 6], "n_estimators": [50, 100, 200]},
        scoring="neg_mean_squared_log_error",
        verbose=3,
        #n_jobs=threads,
        #cv=5,
    )

clf.fit(Xtrain_fe, ytrainlog)
print()
print(clf.best_score_)
print(clf.best_params_)

skip


In [74]:
xgb_final_model = XGBRegressor(
    tree_method="gpu_hist",
    objective="reg:squaredlogerror",
    max_depth=5,
    n_estimators=500,
)

xgb_final_model.fit(Xtrain_fe, ytrainlog);

print("train rmsle = ",round(rmsle(ytrain, np.expm1(xgb_final_model.predict(Xtrain_fe))), 3))
print("test rmsle  = ",round(rmsle(ytest, np.expm1(xgb_final_model.predict(Xtest_fe))), 3))

train rmsle =  0.09
test rmsle  =  0.334


In [75]:
xgb_final_model.fit(X_fe, ylog);

print("rmsle = ",round(rmsle(y, np.expm1(xgb_final_model.predict(X_fe))), 3))

rmsle =  0.106


In [76]:
superX = biketest[X.columns]
superX_fe = ct.transform(superX)

submission = pd.DataFrame()

submission["datetime"] = superX.index
submission["count"] = np.expm1(xgb_final_model.predict(superX_fe))

In [77]:
submission

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,9.473688
1,2011-01-20 01:00:00,3.758754
2,2011-01-20 02:00:00,3.030836
3,2011-01-20 03:00:00,2.720711
4,2011-01-20 04:00:00,1.512363
...,...,...
6488,2012-12-31 19:00:00,329.979919
6489,2012-12-31 20:00:00,240.685211
6490,2012-12-31 21:00:00,141.417130
6491,2012-12-31 22:00:00,104.800705


In [78]:
#submission.to_csv("../data/bike_pred_v10.csv", index=False)