In [5]:
import atexit
import shutil
import tempfile

import numpy as np
import pandas as pd
from joblib import Memory
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the raw short-term cyclist counts and derive calendar features from
# each bin's start timestamp in one chained expression.
file_path = '../../data/raw/volumes_atr_cyclists_shortterm.csv'
df = pd.read_csv(file_path).assign(
    # Parse the timestamp strings first: assign() evaluates its keyword
    # arguments in order, so the lambdas below see a datetime column.
    datetime_bin_start=lambda frame: pd.to_datetime(frame['datetime_bin_start']),
    hour=lambda frame: frame['datetime_bin_start'].dt.hour,
    day_of_week=lambda frame: frame['datetime_bin_start'].dt.dayofweek,
    month=lambda frame: frame['datetime_bin_start'].dt.month,
)

# Model inputs, grouped by the preprocessing each group receives.
numeric_features = ['hour', 'day_of_week', 'month', 'daily_temperature', 'daily_precipitation']
categorical_features = ['direction', 'location', 'class_type']

# Design matrix (numeric columns first, then categorical) and target.
features = [*numeric_features, *categorical_features]
X = df.loc[:, features]
y = df['volume']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create a temporary directory that backs the joblib cache below.
cachedir = tempfile.mkdtemp()

# mkdtemp() leaves deletion to the caller, so without this every run of the
# cell would leave a cache directory behind. Remove it (and anything joblib
# wrote into it) when the interpreter/kernel shuts down; ignore_errors keeps
# shutdown quiet if the directory is already gone.
atexit.register(shutil.rmtree, cachedir, ignore_errors=True)

# joblib Memory object; pass it as `memory=` to a Pipeline to cache the
# pipeline's fitted transformers on disk.
memory = Memory(location=cachedir, verbose=0)

# Numeric columns: fill missing values with the column median, then
# standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace missing values with the literal 'missing',
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each feature group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Support Vector Regression (SVR):

In [7]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Support Vector Regression baseline: the shared preprocessing step followed
# by an SVR constructed with its default hyperparameters.
svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
])

# Pipeline.fit returns the fitted pipeline, so training and held-out
# prediction can be chained.
svr_predictions = svr_pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
svr_mse = mean_squared_error(y_true=y_test, y_pred=svr_predictions)
svr_r2 = r2_score(y_true=y_test, y_pred=svr_predictions)

print(
    "\nSupport Vector Regression Results:",
    f"Mean Squared Error: {svr_mse}",
    f"R2 Score: {svr_r2}",
    sep="\n",
)


Support Vector Regression Results:
Mean Squared Error: 2604.806651510767
R2 Score: 0.24585456994800547


Gradient Boosting Regression:

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting baseline: the shared preprocessing step followed by a
# seeded GradientBoostingRegressor. The joblib `memory` object is handed to
# the pipeline so it can cache its fitted transformers.
gbr_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ],
    memory=memory,
)

# Pipeline.fit returns the fitted pipeline, so training and held-out
# prediction can be chained.
gbr_predictions = gbr_pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
gbr_mse = mean_squared_error(y_true=y_test, y_pred=gbr_predictions)
gbr_r2 = r2_score(y_true=y_test, y_pred=gbr_predictions)

print(
    "\nGradient Boosting Regression Results:",
    f"Mean Squared Error: {gbr_mse}",
    f"R2 Score: {gbr_r2}",
    sep="\n",
)


Gradient Boosting Regression Results:
Mean Squared Error: 1686.6199945687417
R2 Score: 0.5116886082885992


K-Nearest Neighbors Regression:

In [10]:
from sklearn.neighbors import KNeighborsRegressor

# K-Nearest Neighbors baseline: the shared preprocessing step followed by a
# KNeighborsRegressor constructed with its default hyperparameters.
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
])

# Pipeline.fit returns the fitted pipeline, so training and held-out
# prediction can be chained.
knn_predictions = knn_pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
knn_mse = mean_squared_error(y_true=y_test, y_pred=knn_predictions)
knn_r2 = r2_score(y_true=y_test, y_pred=knn_predictions)

print(
    "\nK-Nearest Neighbors Regression Results:",
    f"Mean Squared Error: {knn_mse}",
    f"R2 Score: {knn_r2}",
    sep="\n",
)


K-Nearest Neighbors Regression Results:
Mean Squared Error: 1222.2498680926915
R2 Score: 0.6461333696805687


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Random forest tuned by an exhaustive grid search over tree count, depth,
# and split/leaf size limits.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Keys use the `<step name>__<parameter>` form so each value list is routed
# to the 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_estimators': [20, 60, 80],
    'regressor__max_depth': [3, 5, 10, 15],
    'regressor__min_samples_split': [2, 5, 8],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validation on the training split, scored by negated MSE
# (so larger is better), with n_jobs=-1 to run candidates in parallel.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best negative MSE:", grid_search.best_score_)

# Predict on the held-out split with the best pipeline found by the search.
best_rf_model = grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_test)

# Score the held-out predictions.
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)
rf_r2 = r2_score(y_true=y_test, y_pred=rf_predictions)

print(
    "\nRandom Forest Regression Results:",
    f"Mean Squared Error: {rf_mse}",
    f"R2 Score: {rf_r2}",
    sep="\n",
)

Best parameters: {'regressor__max_depth': 15, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 60}
Best negative MSE: -1171.7833983858823

Random Forest Regression Results:
Mean Squared Error: 1058.8169356315643
R2 Score: 0.6934505857449818
