In [1]:
import sys
# Add the path to the 'src' directory, not the 'src/utils.py' file
sys.path.append('../src')  # This adds the 'src' directory to the sys.path

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from catboost import CatBoostRegressor

from utils import get_absolute_path, eval_metrics, ColumnsOneHotEncoder



# 1. Dataset

In [2]:
# Columns kept for modelling. 'percentage_docks_available' is listed first;
# the remaining entries are the station, calendar and 'ctx-*' columns.
FEATURES = [
    'percentage_docks_available',
    'station_id',
    'post_code',
    'altitude',
    'laboral_day',
    'weekday',
    'month',
    'day',
    'hour',
    'ctx-4',
    'ctx-3',
    'ctx-2',
    'ctx-1',
]

# Read the full stations CSV; the path is resolved through the project helper.
DATASET = pd.read_csv(
    get_absolute_path('../data/processed/groupby/stations_final_2023.csv')
)

# 2. Pipeline

In [3]:
# Preprocessing chain for the feature matrix. Steps execute in the order
# listed: one-hot encoding, then standardisation, then mean imputation.
pipeline = Pipeline(steps=[
    (
        'onehot',
        ColumnsOneHotEncoder(
            columns_to_onehot=['station_id', 'weekday', 'month', 'day', 'hour', 'post_code'],
        ),
    ),
    ('scaler', StandardScaler()),
    # Imputation runs after scaling, so missing entries are filled with the
    # mean of the already standardised column.
    ('imputer', SimpleImputer(strategy='mean')),
])

# 3. Split

In [4]:
# Narrow the loaded table down to the modelling columns.
dataset = DATASET[FEATURES]

# Predictors are every selected column except the target; the target is
# 'percentage_docks_available'.
X = dataset.drop(columns='percentage_docks_available')
y = dataset['percentage_docks_available']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Build

In [5]:
# Fit the preprocessing on the training split only and transform it in the
# same pass. This replaces a separate fit() followed by transform(), which
# pushed the training data through every step twice.
X_train_transformed = pipeline.fit_transform(X_train)

# The held-out split is transformed with the parameters learned above; it is
# never used for fitting.
X_test_transformed = pipeline.transform(X_test)

# Train the gradient-boosting regressor on the preprocessed training data and
# predict the held-out rows.
model = CatBoostRegressor(verbose=0, depth=10, l2_leaf_reg=3, learning_rate=0.1)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)

# Score the held-out predictions with the project helper, which returns the
# three metrics in this order.
rmse, mae, r2 = eval_metrics(y_test, y_pred)

# Report the evaluation metrics.
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")

RMSE: 0.10194847369052555
MAE: 0.06813020284460979
R2 Score: 0.8500652236647668


# 5. Kaggle

In [7]:
# Load the feature table that accompanies the sample submission.
DATASET_SAMPLE = pd.read_csv(
    get_absolute_path('../data/processed/groupby/metadata_sample_submission_2024_features.csv')
)

# Keep the same columns the model was trained on: every entry of FEATURES
# except the target, in the same order.
sample_columns = [name for name in FEATURES if name != 'percentage_docks_available']
dataset_sample = DATASET_SAMPLE[sample_columns]

# Apply the already fitted preprocessing, then predict.
dataset_sample_transformed = pipeline.transform(dataset_sample)
y_pred_sample = model.predict(dataset_sample_transformed)

# NOTE(review): predictions are written as-is; if the target is a fraction in
# [0, 1], clipping may be wanted before submitting -- confirm.
# The frame keeps its default positional index, labelled "index" so that the
# CSV header carries that name.
prediction_df = pd.DataFrame(
    y_pred_sample,
    columns=["percentage_docks_available"],
).rename_axis("index")

prediction_df.to_csv(
    get_absolute_path('../data/processed/groupby/metadata_sample_submission_2024_features_predictions.csv')
)