In [1]:
import sys
# Add the path to the 'src' directory, not the 'src/utils.py' file
sys.path.append('../src')  # This adds the 'src' directory to the sys.path

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
# StandardScaler is published by sklearn.preprocessing. Importing it from
# sklearn.discriminant_analysis relies on that module's internal import
# rather than on a documented public name.
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

# Project helpers living in ../src/utils.py (made importable by the sys.path line above).
from utils import get_absolute_path, eval_metrics, ColumnsOneHotEncoder



# 1. Dataset

In [2]:
# Columns selected from the dataset for every experiment.
# 'percentage_docks_available' is the regression target; the other entries
# are the candidate predictors.
FEATURES = [
    'percentage_docks_available',
    'station_id',
    'post_code',
    'altitude',
    'laboral_day',
    'weekday',
    'month',
    'day',
    'hour',
    'ctx-4',
    'ctx-3',
    'ctx-2',
    'ctx-1',
]

# Feature-ablation experiments, keyed by experiment name:
#   'all'              -> every column in FEATURES
#   'remove_<column>'  -> FEATURES without that single column
# Only station_id and post_code are ablated at the moment; add further column
# names (e.g. 'altitude', 'laboral_day', 'weekday') to the tuple below to
# enable more ablations.
TRAIN_FEATURES = {
    'all': FEATURES,
    **{
        f'remove_{dropped}': [name for name in FEATURES if name != dropped]
        for dropped in ('station_id', 'post_code')
    },
}

# Load the processed station table once; every experiment below selects its
# columns from this frame. The relative path is resolved by the project helper
# get_absolute_path.
DATASET = pd.read_csv(
    filepath_or_buffer=get_absolute_path('../data/processed/groupby/stations_final_2023.csv'),
)

# 2. Pipeline

In [3]:
# Preprocessing shared by all models. Steps run in the order listed.
pipeline = Pipeline(steps=[
    # 1) One-hot encode the listed columns with the project's encoder.
    (
        'onehot',
        ColumnsOneHotEncoder(
            columns_to_onehot=['station_id', 'weekday', 'month', 'day', 'hour', 'post_code'],
        ),
    ),
    # 2) Standardise the resulting feature matrix.
    ('scaler', StandardScaler()),
    # 3) Replace any remaining missing values with the column mean.
    ('imputer', SimpleImputer(strategy='mean')),
])

# 3. Models

In [4]:
# Seed handed to every stochastic estimator below so that rerunning the
# notebook reproduces the same metrics (same value as the train/test split seed).
MODEL_SEED = 42

# Regressors compared in the experiments, from simplest to most complex.
MODELS = [
    LinearRegression(),
    # dual=False (primal solver) is only accepted together with the squared
    # epsilon-insensitive loss, so the loss is set here at construction time
    # instead of leaving the estimator in an invalid configuration.
    LinearSVR(dual=False, max_iter=10000, loss='squared_epsilon_insensitive'),
    DecisionTreeRegressor(random_state=MODEL_SEED),
    RandomForestRegressor(random_state=MODEL_SEED),
    GradientBoostingRegressor(random_state=MODEL_SEED),
    CatBoostRegressor(silent=True, random_seed=MODEL_SEED),
]

In [5]:
# Train and score every model on every feature set.
# pandas is already imported in the notebook's first cell, so it is not re-imported here.
model_metrics = []  # One record per (feature set, model) pair

for key, features in TRAIN_FEATURES.items():
    dataset = DATASET[features]
    y = dataset['percentage_docks_available']
    X = dataset.drop('percentage_docks_available', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Fit the preprocessing on the training split only, then reuse the fitted
    # pipeline on the test split so no test statistics leak into training.
    X_train_transformed = pipeline.fit_transform(X_train)
    X_test_transformed = pipeline.transform(X_test)

    # Identical for every model of this feature set, so it is built once per set.
    model_features = ', '.join([f for f in features if f != 'percentage_docks_available'])

    for model in MODELS:
        model_name = type(model).__name__

        if model_name == 'LinearSVR':
            # Adjust LinearSVR parameters to avoid the ValueError raised when
            # dual=False is combined with the default loss.
            model.set_params(dual=False, max_iter=10000, loss='squared_epsilon_insensitive')

        model.fit(X_train_transformed, y_train)
        y_pred = model.predict(X_test_transformed)

        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        model_metrics.append({
            'Key': key,
            'Model': model_name,
            'Features': model_features,
            'RMSE': rmse,
            'MAE': mae,
            'R2 Score': r2
        })

# Build the results table once from the collected records and display it.
metrics_df = pd.DataFrame(model_metrics)
metrics_df

Unnamed: 0,Key,Model,Features,RMSE,MAE,R2 Score
0,all,LinearRegression,"station_id, post_code, altitude, laboral_day, ...",0.11065,0.074488,0.823378
1,all,LinearSVR,"station_id, post_code, altitude, laboral_day, ...",0.110631,0.074457,0.82344
2,all,DecisionTreeRegressor,"station_id, post_code, altitude, laboral_day, ...",0.143747,0.096439,0.701915
3,all,RandomForestRegressor,"station_id, post_code, altitude, laboral_day, ...",0.108433,0.071725,0.830385
4,all,GradientBoostingRegressor,"station_id, post_code, altitude, laboral_day, ...",0.108977,0.072963,0.828678
5,all,CatBoostRegressor,"station_id, post_code, altitude, laboral_day, ...",0.103206,0.068937,0.846343
6,remove_station_id,LinearRegression,"post_code, altitude, laboral_day, weekday, mon...",0.111062,0.074597,0.82206
7,remove_station_id,LinearSVR,"post_code, altitude, laboral_day, weekday, mon...",0.111055,0.074589,0.822083
8,remove_station_id,DecisionTreeRegressor,"post_code, altitude, laboral_day, weekday, mon...",0.150145,0.101588,0.674792
9,remove_station_id,RandomForestRegressor,"post_code, altitude, laboral_day, weekday, mon...",0.107765,0.072022,0.832469
