# Load dataset

In [20]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch the Bike Sharing Dataset (dataset id 275) using ucimlrepo
bike_sharing = fetch_ucirepo(id=275)

# Dataset info: print the abstract stored in the dataset metadata
print(bike_sharing.metadata.abstract)

This dataset contains the hourly and daily count of rental bikes between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.


In [21]:
# Display the per-variable metadata table (name, role, type, description, missing values)
bike_sharing.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,instant,ID,Integer,,record index,,no
1,dteday,Feature,Date,,date,,no
2,season,Feature,Categorical,,"1:winter, 2:spring, 3:summer, 4:fall",,no
3,yr,Feature,Categorical,,"year (0: 2011, 1: 2012)",,no
4,mnth,Feature,Categorical,,month (1 to 12),,no
5,hr,Feature,Categorical,,hour (0 to 23),,no
6,holiday,Feature,Binary,,weather day is holiday or not (extracted from ...,,no
7,weekday,Feature,Categorical,,day of the week,,no
8,workingday,Feature,Binary,,"if day is neither weekend nor holiday is 1, ot...",,no
9,weathersit,Feature,Categorical,,"- 1: Clear, Few clouds, Partly cloudy, Partly ...",,no


Inspect the features (`df`) and the target (`y`)

In [22]:
# Feature table only. The per-user-type counts (casual and registered) are not
# among the features; only the aggregated count is kept, as the target `cnt`.

df = bike_sharing.data.features 
df

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642
17375,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642
17376,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642
17377,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343


In [23]:
# Target table: a single column, `cnt`, aligned row-by-row with the features
y = bike_sharing.data.targets 
y

Unnamed: 0,cnt
0,16
1,40
2,32
3,13
4,1
...,...
17374,119
17375,89
17376,90
17377,61


Preprocess the dataset by removing the attributes that are not used as model inputs (`dteday` and `yr`)

In [24]:
def preprocess_dataset(X):
    """
    Return a copy of the feature table without the date and year columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing at least the ``dteday`` and ``yr`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``X`` except ``dteday`` and ``yr``.
        ``X`` itself is left unmodified.

    Raises
    ------
    KeyError
        If either column is missing from ``X``.
    """
    unused_columns = ["dteday", "yr"]
    return X.drop(unused_columns, axis=1)

# Feature matrix fed to the models below: `df` without the `dteday` and `yr` columns
X = preprocess_dataset(df)

### Decision tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with a fixed seed so repeated runs build the same tree
clf_tree = DecisionTreeRegressor(random_state=123)

Let's find the best parameters with a grid search

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out a test set BEFORE tuning. Scoring the tuned tree on the rows it was
# fitted on reports training error, which is overly optimistic for a deep tree.
# The split parameters match the linear-regression evaluation, so both models
# are scored on the same held-out rows and their metrics are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Candidate hyper-parameters; every combination is cross-validated.
param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=clf_tree,
    param_grid=param_grid,
)

# Tune on the training portion only; the test rows stay unseen.
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the tree refitted on all
# training rows using the best parameter combination.
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Generalisation metrics, measured on the held-out test set.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.4f}")

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression baseline with scikit-learn's default settings
clf_lr = LinearRegression()

Evaluate the model

In [None]:
# The metrics are imported here as well, so this cell does not rely on the
# (long-running) grid-search cell having been executed first.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

clf_lr.fit(X_train, y_train)

# Evaluation on the held-out test set
y_pred = clf_lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE (Linear): {mse:.2f}")
print(f"R² (Linear): {r2:.4f}")