## No external data + xgboost + cosine - 0.76

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Constants
# Human-readable title of the task; not referenced again in this cell.
problem_title = "Bike count prediction"
# Name of the target column that _read_data separates from the training features.
_target_column_name = "log_bike_count"

# Function to read data
def _read_data(path, f_name, is_train=True):
    """Load one parquet split and sort it by date, then counter name.

    Parameters
    ----------
    path : str
        Base directory; the file is read from ``<path>/input/msdb-2023/``.
    f_name : str
        Name of the parquet file.
    is_train : bool, default True
        When true, the target is split off from the features.

    Returns
    -------
    ``(X_df, y_array)`` when ``is_train`` is true: ``y_array`` holds the
    values of the ``_target_column_name`` column and ``X_df`` is the sorted
    frame without that column and without ``bike_count``. Otherwise the
    sorted frame alone.
    """
    parquet_file = os.path.join(path, "input/msdb-2023/", f_name)
    frame = pd.read_parquet(parquet_file).sort_values(["date", "counter_name"])

    if not is_train:
        return frame

    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target

# Get train and test data
def get_train_data(path="."):
    """Return ``(X_df, y_array)`` read from ``train.parquet`` via ``_read_data``."""
    return _read_data(path, "train.parquet", is_train=True)

def get_test_data(path="."):
    """Return the feature frame read from ``final_test.parquet`` via ``_read_data``."""
    return _read_data(path, "final_test.parquet", is_train=False)

# Load the train and test data
# Both helpers are called with their default path ("."); only the training
# call also returns the target array.
X_train, y_train = get_train_data()
X_test = get_test_data()

# Date encoding with cyclic hour feature
def _encode_dates(X):
    X = X.copy()
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour_sin"] = np.sin(2 * np.pi * X["date"].dt.hour/23.0)
    X.loc[:, "hour_cos"] = np.cos(2 * np.pi * X["date"].dt.hour/23.0)

    return X.drop(columns=["date"])

# Wrap the date feature function so it can run as the first pipeline step.
date_encoder = FunctionTransformer(_encode_dates)
# Names of the columns _encode_dates produces when given only the "date" column.
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

# Preprocessing
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

# NOTE(review): every date feature, including the continuous hour_sin /
# hour_cos values, is one-hot encoded here, so each distinct value becomes
# its own category -- confirm this is intended rather than passing the
# sin/cos columns through as numbers.
# Columns not named below are dropped (ColumnTransformer's documented default).
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

# Model
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')

# Steps run in order: date features -> column encoding -> regressor.
pipeline = make_pipeline(date_encoder, preprocessor, xgb_regressor)
pipeline.fit(X_train, y_train)


In [4]:
# Predict on the test features and pair each prediction with its row index
y_pred = pipeline.predict(X_test)
results_dict = dict(Id=X_test.index.tolist(), log_bike_count=y_pred)

# Build the submission table and write it without the DataFrame index
results_df = pd.DataFrame(results_dict)
results_df.to_csv("submission.csv", index=False)


In [3]:
from sklearn.metrics import mean_squared_error

# RMSE on the training rows. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated in recent
# scikit-learn releases; this form works on old and new versions alike.
train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
print(f"Train set, RMSE={train_rmse:.2f}")

Train set, RMSE=0.52


## Weather data + xgboost + cosine

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb  # Import XGBoost
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import StandardScaler

# Human-readable title of the task; not referenced again in this cell.
problem_title = "Bike count prediction"
# Name of the target column that _read_data separates from the training features.
_target_column_name = "log_bike_count"

def get_cv(X, y, random_state=0):
    """Yield ``(train_idx, test_idx)`` pairs from an 8-split ``TimeSeriesSplit``.

    Each test fold is randomly subsampled, without replacement, to one third
    of its size (integer division) using a ``RandomState`` seeded with
    ``random_state``. ``y`` is accepted but not used.
    """
    splitter = TimeSeriesSplit(n_splits=8)
    sampler = np.random.RandomState(random_state)

    for fit_rows, holdout_rows in splitter.split(X):
        n_keep = len(holdout_rows) // 3
        yield fit_rows, sampler.choice(holdout_rows, size=n_keep, replace=False)

def _read_data(path, f_name, is_train=True):
    """Load one parquet split and sort it by date, then counter name.

    The file is read with ``pd.read_parquet`` from ``<path>/<f_name>``. If a
    ``date`` column is present it is converted with ``pd.to_datetime`` before
    sorting.

    Returns
    -------
    ``(X_df, y_array)`` when ``is_train`` is true: ``y_array`` holds the
    values of the ``_target_column_name`` column and ``X_df`` is the sorted
    frame without that column and without ``bike_count``. Otherwise the
    sorted frame alone.
    """
    frame = pd.read_parquet(os.path.join(path, f_name))
    if "date" in frame.columns:
        frame["date"] = pd.to_datetime(frame["date"])
    frame = frame.sort_values(["date", "counter_name"])

    if not is_train:
        return frame

    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target


def get_train_data(path="."):
    """Return ``(X_df, y_array)`` read from ``train.parquet`` via ``_read_data``."""
    return _read_data(path, "train.parquet", is_train=True)

def get_test_data(path="."):
    """Return the feature frame read from ``final_test.parquet`` via ``_read_data``."""
    return _read_data(path, "final_test.parquet", is_train=False)

# Read both splits using the helpers' default path
X_train, y_train = get_train_data()
# 'coordinates' is removed from the test features only; show the first rows
X_test = get_test_data().drop(columns=["coordinates"])
X_test.head()


Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude
17081,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
18655,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
3124,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702
4147,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702
48210,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2021-09-10 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666


In [2]:
def _encode_dates(X):
    X = X.copy()
    #X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month_sin"] = np.sin(2 * np.pi * X["date"].dt.month / 12.0)
    X.loc[:, "month_cos"] = np.cos(2 * np.pi * X["date"].dt.month / 12.0)
    #X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour_sin"] = np.sin(2 * np.pi * X["date"].dt.hour/23.0)
    X.loc[:, "hour_cos"] = np.cos(2 * np.pi * X["date"].dt.hour/23.0)

    # weekend encoding
    X.loc[:, "is_weekend"] = (X["date"].dt.weekday >= 5).astype(int)


    return X.drop(columns=["date"])

def _merge_external_data(X):
    file_path = "hourly-weather-data.csv"  
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    # Remove rows with null 'date' in the external data
    X['date'] = X['date'].astype('datetime64[ns]')
    df_ext['date'] = df_ext['date'].astype('datetime64[ns]')
    df_ext = df_ext.dropna(subset=['date'])

    X = X.copy()
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(X.sort_values("date"), df_ext.sort_values("date"), on="date")
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X



In [3]:
# Wrap the date feature function so it can run as a pipeline step.
date_encoder = FunctionTransformer(_encode_dates)
# Names of the columns _encode_dates produces when given only the "date" column.
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

# Column expected to come from the external CSV merged in by _merge_external_data.
numerical_cols = ['feelslike']

data_merger = FunctionTransformer(_merge_external_data, validate=False)


# NOTE(review): every date feature, including the continuous sin/cos values,
# is one-hot encoded here, so each distinct value becomes its own category --
# confirm this is intended rather than passing them through as numbers.
# Columns not named below are dropped (ColumnTransformer's documented default).
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("standard scaler", StandardScaler(), numerical_cols)
    ]
)

regressor = xgb.XGBRegressor(learning_rate=0.1, max_depth=10, min_child_weight=12,n_estimators=100)  # no objective is passed here, so the library default is used

# Steps run in order: merge external data -> date features -> column encoding -> regressor.
# NOTE(review): the output saved with this cell is
# "ValueError: Missing column provided to 'parse_dates': 'date'", i.e. the CSV
# read by _merge_external_data had no 'date' column on that run -- verify the
# file before relying on the outputs of the following cells.
pipeline = Pipeline(
    steps=[
        ('merge external data', data_merger),
        ('date encoder', date_encoder),
        ('preprocessor', preprocessor),
        ('regressor',regressor)
    ]
)
pipeline.fit(X_train, y_train)

ValueError: Missing column provided to 'parse_dates': 'date'

In [11]:
# Predict on the test features and pair each prediction with its row index
y_pred = pipeline.predict(X_test)
results_dict = dict(Id=X_test.index.tolist(), log_bike_count=y_pred)

# Build the submission table and write it without the DataFrame index
results_df = pd.DataFrame(results_dict)
results_df.to_csv("submission.csv", index=False)

In [10]:
from sklearn.metrics import mean_squared_error

# RMSE on the training rows. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated in recent
# scikit-learn releases; this form works on old and new versions alike.
train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
print(f"Train set, RMSE={train_rmse:.2f}")

Train set, RMSE=0.54


## All external Data + xgboost

In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import StandardScaler
# from lightgbm import LGBMRegressor


# Human-readable title of the task; not referenced again in this cell.
problem_title = "Bike count prediction"
# Name of the target column that _read_data separates from the training features.
_target_column_name = "log_bike_count"

def get_cv(X, y, random_state=0):
    """Yield ``(train_idx, test_idx)`` pairs from an 8-split ``TimeSeriesSplit``.

    Each test fold is randomly subsampled, without replacement, to one third
    of its size (integer division) using a ``RandomState`` seeded with
    ``random_state``. ``y`` is accepted but not used.
    """
    splitter = TimeSeriesSplit(n_splits=8)
    sampler = np.random.RandomState(random_state)

    for fit_rows, holdout_rows in splitter.split(X):
        n_keep = len(holdout_rows) // 3
        yield fit_rows, sampler.choice(holdout_rows, size=n_keep, replace=False)

def _read_data(path, f_name, is_train=True):
    """Load one parquet split and sort it by date, then counter name.

    The file is read with ``pd.read_parquet`` from ``<path>/<f_name>``. If a
    ``date`` column is present it is converted with ``pd.to_datetime`` before
    sorting.

    Returns
    -------
    ``(X_df, y_array)`` when ``is_train`` is true: ``y_array`` holds the
    values of the ``_target_column_name`` column and ``X_df`` is the sorted
    frame without that column and without ``bike_count``. Otherwise the
    sorted frame alone.
    """
    frame = pd.read_parquet(os.path.join(path, f_name))
    if "date" in frame.columns:
        frame["date"] = pd.to_datetime(frame["date"])
    frame = frame.sort_values(["date", "counter_name"])

    if not is_train:
        return frame

    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target


def get_train_data(path="."):
    """Return ``(X_df, y_array)`` read from ``train.parquet`` via ``_read_data``."""
    return _read_data(path, "train.parquet", is_train=True)

def get_test_data(path="."):
    """Return the feature frame read from ``final_test.parquet`` via ``_read_data``."""
    return _read_data(path, "final_test.parquet", is_train=False)

# Read both splits using the helpers' default path
X_train, y_train = get_train_data()
# 'coordinates' is removed from the test features only; show the first rows
X_test = get_test_data().drop(columns=["coordinates"])
X_test.head()


Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude
17081,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
18655,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
3124,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702
4147,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12,Y2H19027732,48.85372,2.35702
48210,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2021-09-10 01:00:00,2020-07-22,Y2H20073268,48.88529,2.32666


In [5]:
def _encode_dates(X):
    X = X.copy()
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday

    # weekend encoding
    X.loc[:, "is_weekend"] = (X["date"].dt.weekday >= 5).astype(int)

    # cosine encodings to capture cyclical patterns
    # months
    X.loc[:, "month_sin"] = np.sin(2 * np.pi * X["date"].dt.month / 12.0)
    X.loc[:, "month_cos"] = np.cos(2 * np.pi * X["date"].dt.month / 12.0)
    # hours
    X.loc[:, "hour_sin"] = np.sin(2 * np.pi * X["date"].dt.hour / 23.0)
    X.loc[:, "hour_cos"] = np.cos(2 * np.pi * X["date"].dt.hour / 23.0)

    # Rush hour for weekdays (Monday=0, Sunday=6)
    X.loc[:, "morning_rush"] = ((X["weekday"] < 5) & (X["date"].dt.hour >= 7) & (X["date"].dt.hour <= 9)).astype(int)
    X.loc[:, "evening_rush"] = ((X["weekday"] < 5) & (X["date"].dt.hour >= 16) & (X["date"].dt.hour <= 18)).astype(int)
    
    return X.drop(columns=["date"])



def _merge_external_data(X):
    file_path = "all-ext-data.csv"
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    # Remove rows with null 'date' in the external data
    X['date'] = X['date'].astype('datetime64[ns]')
    df_ext['date'] = df_ext['date'].astype('datetime64[ns]')
    df_ext = df_ext.dropna(subset=['date'])

    X = X.copy()
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(X.sort_values("date"), df_ext.sort_values("date"), on="date")
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X



In [24]:
# Preview the external data file that _merge_external_data reads.
# NOTE(review): the saved output shows an "Unnamed: 0" column and the same
# timestamp repeated on all five rows -- check whether the CSV contains
# duplicate rows and a stray saved index.
ext_data = pd.read_csv("all-ext-data.csv", parse_dates=["date"])
ext_data.head()

Unnamed: 0,date,feelslike,humidity,precip,conditions,is_holiday,full_lockdown
0,2020-09-19 19:00:00,23.8,52.82,0.0,Overcast,0,0
1,2020-09-19 19:00:00,23.8,52.82,0.0,Overcast,0,0
2,2020-09-19 19:00:00,23.8,52.82,0.0,Overcast,0,0
3,2020-09-19 19:00:00,23.8,52.82,0.0,Overcast,0,0
4,2020-09-19 19:00:00,23.8,52.82,0.0,Overcast,0,0


In [7]:
# wrap the two feature functions so they can run as pipeline steps
date_encoder = FunctionTransformer(_encode_dates, validate=False)
data_merger = FunctionTransformer(_merge_external_data, validate=False)

# date columns: names _encode_dates produces when given only the "date" column
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

# categorical columns ('is_holiday' appears in the external data preview)
categorical_cols = ['counter_name', 'site_name', 'is_holiday']

# numerical columns (both appear in the external data preview)
numerical_cols = ['feelslike', 'precip']

# preprocess these features
# NOTE(review): every date feature, including the continuous sin/cos values,
# is one-hot encoded here, so each distinct value becomes its own category --
# confirm this is intended rather than passing them through as numbers.
# Columns not named below are dropped (ColumnTransformer's documented default).
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("categories", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("standard scaler", StandardScaler(), numerical_cols),
    ]
)

regressor = xgb.XGBRegressor(learning_rate=0.5, max_depth=8, min_child_weight=16, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.7500000000000001, verbosity=0)
# Disabled alternative model (its import is commented out as well):
#regressor = LGBMRegressor(learning_rate=0.5, max_depth=8, n_estimators=100)


# define the pipeline: merge external data -> date features -> column encoding -> regressor
pipeline = Pipeline(
    steps=[
        ('merge external data', data_merger),
        ('date encoder', date_encoder),
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ]
)

# fit the training data
pipeline.fit(X_train, y_train)

In [8]:
from sklearn.metrics import mean_squared_error

# RMSE on the training rows. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated in recent
# scikit-learn releases; this form works on old and new versions alike.
train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
print(f"Train set, RMSE={train_rmse:.2f}")

Train set, RMSE=0.39


In [42]:
# Predict on the test features and pair each prediction with its row index
y_pred = pipeline.predict(X_test)
results_dict = dict(Id=X_test.index.tolist(), log_bike_count=y_pred)

# Build the submission table and write it without the DataFrame index
results_df = pd.DataFrame(results_dict)
results_df.to_csv("submission.csv", index=False)