In [25]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb  # Import XGBoost
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import StandardScaler

# Human-readable title of the task; not referenced by any other code in this notebook.
problem_title = "Bike count prediction"
# Column that _read_data extracts as the regression target and removes from the features.
_target_column_name = "log_bike_count"

def get_cv(X, y, random_state=0):
    """Yield ``(train_idx, validation_idx)`` pairs for time-ordered cross-validation.

    An 8-fold ``TimeSeriesSplit`` is run over ``X``; for every fold the training
    indices are passed through unchanged, while only a random third of the
    fold's test indices (drawn without replacement) is kept for validation.

    Parameters
    ----------
    X : array-like
        Samples to split; only its length/order is used by the splitter.
    y : array-like
        Accepted for interface compatibility; not used by this function.
    random_state : int, default 0
        Seed of the ``RandomState`` that subsamples the validation indices.

    Yields
    ------
    tuple of (ndarray, ndarray)
        Training indices and the subsampled validation indices of one fold.
    """
    sampler = np.random.RandomState(random_state)
    splitter = TimeSeriesSplit(n_splits=8)

    for fit_idx, holdout_idx in splitter.split(X):
        n_keep = len(holdout_idx) // 3
        yield fit_idx, sampler.choice(holdout_idx, size=n_keep, replace=False)

def _read_data(path, f_name, is_train=True):
    """Read a parquet file and return it sorted by date and counter name.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    f_name : str
        Name of the parquet file inside ``path``.
    is_train : bool, default True
        When True the target column is split off and returned separately.

    Returns
    -------
    (DataFrame, ndarray) or DataFrame
        For training data: the features (without the target column and
        without ``bike_count``) and the target values. Otherwise the whole
        sorted frame.
    """
    frame = pd.read_parquet(os.path.join(path, f_name))
    if 'date' in frame.columns:
        frame['date'] = pd.to_datetime(frame['date'])
    # Note: the sort below needs a 'date' column even though the conversion
    # above is guarded.
    frame = frame.sort_values(["date", "counter_name"])

    if not is_train:
        return frame

    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target


def get_train_data(path="."):
    """Load ``train.parquet`` from ``path`` and return ``(X_df, y_array)``."""
    return _read_data(path, "train.parquet", is_train=True)

def get_test_data(path="."):
    """Load ``final_test.parquet`` from ``path`` and return it as a feature frame."""
    return _read_data(path, "final_test.parquet", is_train=False)

# Load the training features/target from the loaders' default directory.
X_train, y_train = get_train_data()
# Load the test features and remove their 'coordinates' column in one step.
X_test = get_test_data().drop(columns=['coordinates'])


def _encode_dates(X):
    X = X.copy()
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour_sin"] = np.sin(2 * np.pi * X["date"].dt.hour/23.0)
    X.loc[:, "hour_cos"] = np.cos(2 * np.pi * X["date"].dt.hour/23.0)

    return X.drop(columns=["date"])

def _merge_external_data(X):
    file_path = "external_data.csv"  # Update with the correct path if needed
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    # Remove rows with null 'date' in the external data
    X['date'] = X['date'].astype('datetime64[ns]')
    df_ext['date'] = df_ext['date'].astype('datetime64[ns]')
    #df_ext = df_ext.dropna(subset=['date'])

    X = X.copy()
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(X.sort_values("date"), df_ext.sort_values("date"), on="date")
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X


# Function-based transformers for the external-data merge and the date encoding.
data_merger = FunctionTransformer(_merge_external_data, validate=False)
date_encoder = FunctionTransformer(_encode_dates)

# Column names produced by the date encoder, discovered by running it once
# on the training dates.
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
categorical_cols = ["counter_name", "site_name"]
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# Only the columns listed here reach the regressor.
# NOTE(review): every date feature, including hour_sin/hour_cos, is one-hot
# encoded here - confirm that treating them as categories is intended.
# NOTE(review): 'u' and 't' are not created in this cell; presumably they come
# from the external data merged in the first pipeline step - confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("standard scaler", StandardScaler(), ['u', 't']),
    ]
)

regressor = xgb.XGBRegressor(objective='reg:squarederror')

# Full model: merge external data -> encode dates -> select/encode columns -> XGBoost.
pipeline = Pipeline(
    steps=[
        ('merge external data', data_merger),
        ('date encoder', date_encoder),
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ]
)
pipeline.fit(X_train, y_train)



In [26]:
# Run the fitted pipeline end to end on the test features.
y_pred = pipeline.predict(X_test)

# One row per test sample, identified by its index label in X_test.
submission_df = pd.DataFrame(
    {'Id': X_test.index, 'log_bike_count': y_pred}
)

# Write the submission file without the DataFrame's own index.
submission_df.to_csv("submission.csv", index=False)

In [28]:
# Recompute the test predictions and pair them with the X_test index labels.
y_pred = pipeline.predict(X_test)
results_dict = dict(Id=X_test.index.tolist(), log_bike_count=y_pred)

# Build the table from the dictionary and write it out; this overwrites any
# existing submission.csv.
results_df = pd.DataFrame(results_dict)
results_df.to_csv("submission.csv", index=False)


In [27]:
from sklearn.metrics import mean_squared_error

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# previous call raises a TypeError on current releases.
# This is the error on the data the pipeline was fitted on, not a
# validation score.
train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
print(f"Train set, RMSE={train_rmse:.2f}")

Train set, RMSE=0.51
