## No external data + XGBoost + cyclic (sin/cos) hour encoding - RMSE 0.76

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Constants
# Human-readable title of the prediction task.
problem_title = "Bike count prediction"
# Column holding the regression target; _read_data splits it off the training frame.
_target_column_name = "log_bike_count"

# Function to read data
def _read_data(path, f_name, is_train=True):
    """Load one parquet file and return it sorted by date and counter name.

    Parameters
    ----------
    path : str
        Base directory; the file is looked up under ``input/msdb-2023/``
        inside it.
    f_name : str
        Name of the parquet file to read.
    is_train : bool, default True
        When True, the target column and ``bike_count`` are removed from the
        features and the target is returned separately.

    Returns
    -------
    (X_df, y_array) when ``is_train`` is True, otherwise the full sorted
    DataFrame.
    """
    parquet_path = os.path.join(path, "input/msdb-2023/", f_name)
    frame = pd.read_parquet(parquet_path).sort_values(["date", "counter_name"])

    # Nothing to split off for test data: hand back every column.
    if not is_train:
        return frame

    target = frame[_target_column_name].values
    # "bike_count" is dropped together with the target column.
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target

# Get train and test data
def get_train_data(path="."):
    """Return ``(X_df, y_array)`` read from ``train.parquet`` under *path*."""
    return _read_data(path, "train.parquet", is_train=True)

def get_test_data(path="."):
    """Return the feature frame read from ``final_test.parquet`` under *path*."""
    return _read_data(path, "final_test.parquet", is_train=False)

# Load the train and test data from the default base path (".").
# X_train / y_train: training features and target array; X_test: test frame.
X_train, y_train = get_train_data()
X_test = get_test_data()

# Date encoding with cyclic hour feature
def _encode_dates(X):
    """Expand the ``date`` column into calendar and cyclic hour features.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a datetime-typed ``date`` column (accessed through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` without ``date`` and with the added columns ``year``,
        ``month``, ``day``, ``weekday``, ``hour_sin`` and ``hour_cos``.
        The input frame is not modified.
    """
    X = X.copy()
    dates = X["date"].dt

    X["year"] = dates.year
    X["month"] = dates.month
    X["day"] = dates.day
    X["weekday"] = dates.weekday

    # Hours run 0..23, i.e. a period of 24. Dividing by 24 keeps every hour at
    # a distinct, evenly spaced point on the unit circle; dividing by 23 would
    # wrap hour 23 onto hour 0.
    hour_angle = 2 * np.pi * dates.hour / 24.0
    X["hour_sin"] = np.sin(hour_angle)
    X["hour_cos"] = np.cos(hour_angle)

    return X.drop(columns=["date"])

# Wrap the date expansion so it can be used as the first pipeline step.
date_encoder = FunctionTransformer(_encode_dates)
# Names of the generated date columns, obtained by encoding the "date" column alone.
date_cols = list(_encode_dates(X_train[["date"]]).columns)

# Preprocessing
# Both the generated date columns and the counter/site identifiers are one-hot
# encoded; categories unseen during fit are ignored at transform time.
# NOTE(review): date_cols includes the continuous hour_sin / hour_cos columns,
# so they are treated as categories here rather than as numeric cyclic
# features - confirm this is intended.
categorical_cols = ["counter_name", "site_name"]
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

# Model
# Gradient-boosted regressor trained with a squared-error objective.
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')

# Full chain: date expansion -> one-hot preprocessing -> regressor.
pipeline_steps = (date_encoder, preprocessor, xgb_regressor)
pipeline = make_pipeline(*pipeline_steps)
pipeline.fit(X_train, y_train)


In [4]:
# Predict the target for every test row.
y_pred = pipeline.predict(X_test)

# Build the submission table: the test frame's index is used as the row Id.
results_dict = dict(Id=X_test.index.tolist(), log_bike_count=y_pred)
results_df = pd.DataFrame(results_dict)

# Save to CSV
results_df.to_csv("submission.csv", index=False)


In [3]:
from sklearn.metrics import mean_squared_error

# RMSE on the training data itself (in-sample error, not a validation score).
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# yields the same value on every version.
train_rmse = mean_squared_error(y_train, pipeline.predict(X_train)) ** 0.5

print(f"Train set, RMSE={train_rmse:.2f}")

Train set, RMSE=0.52
