# DA5401 A8: Ensemble Learning for Complex Regression Modeling on Bike Share Data

## Part A: Data Preprocessing and Baseline [10 points]

In [61]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

import warnings
# Fix the global NumPy seed up front so stochastic steps are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Suppress every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries used below)
warnings.simplefilter('ignore')

### Data Loading and Feature Engineering

In [63]:
from pathlib import Path
# Root for all relative paths: the kernel's current working directory.
# The data-loading cell below looks for a `data/` folder under this directory,
# so the notebook has to be started from the folder that contains it.
BASE_DIR = Path.cwd()

In [64]:
# Build the CSV locations with pathlib operators, consistent with the
# Path-based BASE_DIR (pandas.read_csv accepts path-like objects directly).
data_dir = BASE_DIR / 'data'
day_data_path = data_dir / 'day.csv'
hour_data_path = data_dir / 'hour.csv'

In [65]:
# Load the hourly file first, then the daily file
hour_df, day_df = (
    pd.read_csv(csv_path) for csv_path in (hour_data_path, day_data_path)
)

# Report (rows, columns) for each frame
for label, frame in (("Hour", hour_df), ("Day", day_df)):
    print(f"{label} DataFrame shape: {frame.shape}")

Hour DataFrame shape: (17379, 17)
Day DataFrame shape: (731, 16)


In [66]:
# Preview the first five hourly records
print("Hour DataFrame:")
hour_df.head(n=5)

Hour DataFrame:


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [67]:
# Summary statistics (count, mean, std, min, quartiles, max) of the hourly data
print("\nHour DataFrame Description:")
hour_df.describe(percentiles=[0.25, 0.5, 0.75])


Hour DataFrame Description:


Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [68]:
# Preview the first five daily records
print("\nDay DataFrame:")
day_df.head(n=5)


Day DataFrame:


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) of the daily data
print("\nDay DataFrame Description:")
day_df.describe(percentiles=[0.25, 0.5, 0.75])


Day DataFrame Description:


Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [70]:
# Count missing entries per column in each frame
missing_reports = (
    ("Missing Values in Hour DataFrame:", hour_df),
    ("\nMissing Values in Day DataFrame:", day_df),
)
for heading, frame in missing_reports:
    print(heading)
    print(frame.isna().sum())

Missing Values in Hour DataFrame:
instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

Missing Values in Day DataFrame:
instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64


In [71]:
# Drop columns that should not be used as features: the row id (instant),
# the raw date string (dteday), and casual/registered, which add up to the
# target cnt in the rows previewed above.
cols_to_drop = ['instant', 'dteday', 'casual', 'registered']

# Reassign instead of mutating in place, and tolerate already-removed columns,
# so re-running this cell does not raise a KeyError.
hour_df = hour_df.drop(columns=cols_to_drop, errors='ignore')

print(f"Hour DataFrame Shape after dropping columns: {hour_df.shape}")
hour_df.head()

Hour DataFrame Shape after dropping columns: (17379, 13)


Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


In [72]:
# One-hot encode the integer-coded categorical columns so regression models
# do not treat their codes as ordered magnitudes.
categorical_features = ['season', 'weathersit', 'mnth', 'hr', 'weekday']

# drop_first=True removes the first level of every feature (the reference category);
# dtype=int stores the indicators as 0/1 integers.
hour_df_encoded = pd.get_dummies(
    data=hour_df,
    columns=categorical_features,
    dtype=int,
    drop_first=True,
)

In [73]:
# Preview the first five rows after one-hot encoding
hour_df_encoded.head(n=5)

Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,cnt,season_2,season_3,...,hr_20,hr_21,hr_22,hr_23,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0,0,0,0.24,0.2879,0.81,0.0,16,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0.22,0.2727,0.8,0.0,40,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0.22,0.2727,0.8,0.0,32,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0.24,0.2879,0.75,0.0,13,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0.24,0.2879,0.75,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [74]:
# Report (rows, columns) of the encoded frame
encoded_shape = hour_df_encoded.shape
print("Shape of the encoded DataFrame:", encoded_shape)

Shape of the encoded DataFrame: (17379, 54)


### Train/Test Split

In [75]:
# Separate the target column from the predictors
target_col = 'cnt'
X = hour_df_encoded.drop(columns=target_col)
y = hour_df_encoded[target_col]

In [76]:
# Report the dimensions of the feature matrix and the target vector
for label, part in (("Features", X), ("Target", y)):
    print(f"{label} shape:", part.shape)

Features shape: (17379, 53)
Target shape: (17379,)


In [77]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and testing (20%) sets.
# Seeded with the notebook-wide RANDOM_SEED constant rather than a repeated literal,
# so the seed can be changed in one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

In [78]:
# Report the dimensions of every split
split_parts = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for label, part in split_parts:
    print(f"{label} shape:", part.shape)

X_train shape: (13903, 53)
X_test shape: (3476, 53)
y_train shape: (13903,)
y_test shape: (3476,)


### Baseline Model (Single Regressor)

In [80]:
# Baseline candidate 1: a Linear Regression fitted on the training split
from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construction and training are chained
lr_model = LinearRegression().fit(X_train, y_train)

print("Linear Regression model trained successfully.")

Linear Regression model trained successfully.


In [81]:
# Baseline candidate 2: a single Decision Tree Regressor limited to depth 6
from sklearn.tree import DecisionTreeRegressor

# Seeded with the notebook-wide RANDOM_SEED constant instead of a repeated literal
dt_model = DecisionTreeRegressor(max_depth=6, random_state=RANDOM_SEED)

# Train the model on the training data
dt_model.fit(X_train, y_train)

print("Decision Tree Regressor model trained successfully.")

Decision Tree Regressor model trained successfully.


In [83]:
# Score both baseline candidates on the held-out test set with RMSE

from sklearn.metrics import mean_squared_error

# Test-set predictions of each fitted model
y_pred_lr, y_pred_dt = lr_model.predict(X_test), dt_model.predict(X_test)

# RMSE is the square root of the mean squared error
rmse_lr, rmse_dt = (
    np.sqrt(mean_squared_error(y_test, preds)) for preds in (y_pred_lr, y_pred_dt)
)

print(f"Linear Regression RMSE: {rmse_lr:.4f}")
print(f"Decision Tree Regressor RMSE: {rmse_dt:.4f}")

Linear Regression RMSE: 100.4459
Decision Tree Regressor RMSE: 118.4555


In [84]:
# Keep whichever single model has the lower test RMSE as the baseline to beat
# (ties go to Linear Regression, because the comparison is strict)

dt_is_better = rmse_dt < rmse_lr
baseline_rmse = rmse_dt if dt_is_better else rmse_lr
better_model = "Decision Tree Regressor" if dt_is_better else "Linear Regression model"

print(f"\nThe {better_model} is the better baseline model.")
print(f"Baseline RMSE to beat: {baseline_rmse:.4f}")


The Linear Regression model is the better baseline model.
Baseline RMSE to beat: 100.4459


## Part B: Ensemble Techniques for Bias and Variance Reduction [20 points]

### Bagging (Variance Reduction)

Hypothesis: Bagging primarily targets variance reduction.

The core idea behind Bagging (Bootstrap Aggregating) is to train multiple base estimators (in our case, Decision Trees) on different random subsets of the training data. The final prediction is the average of all individual tree predictions. This process is highly effective at reducing the model's variance, making it more stable and less prone to overfitting.

In [85]:
# Bagging Regressor built on the same depth-6 Decision Tree used as a baseline candidate,
# with 50 estimators.
from sklearn.ensemble import BaggingRegressor

# Seeds come from the notebook-wide RANDOM_SEED constant instead of repeated literals
base_tree = DecisionTreeRegressor(max_depth=6, random_state=RANDOM_SEED)
bagging_model = BaggingRegressor(
    estimator=base_tree,
    n_estimators=50,
    random_state=RANDOM_SEED,
    oob_score=True,  # Enable out-of-bag scoring
    n_jobs=-1  # Use all available cores
)

# Train the Bagging Regressor model
bagging_model.fit(X_train, y_train)

print("Bagging Regressor model trained successfully.")

Bagging Regressor model trained successfully.


In [89]:

# Make predictions on the test set using the Bagging model
y_pred_bagging = bagging_model.predict(X_test)

# Calculate the RMSE for the Bagging model
rmse_bagging = np.sqrt(mean_squared_error(y_test, y_pred_bagging))

print(f"Bagging Regressor RMSE: {rmse_bagging:.4f}")
# Compare against the ensemble's own base estimator. `rmse_dt` belongs to the
# single Decision Tree, which is not necessarily the overall baseline held in
# `baseline_rmse`, so the output names the tree explicitly.
print(f"Single Decision Tree RMSE: {rmse_dt:.4f}")

if rmse_bagging < rmse_dt:
    improvement = ((rmse_dt - rmse_bagging) / rmse_dt) * 100
    print(f"\nThe Bagging model shows an improvement of {improvement:.2f}% over the single Decision Tree.")
else:
    print("\nThe Bagging model did not improve upon the single Decision Tree.")

Bagging Regressor RMSE: 112.3426
Baseline (Decision Tree) RMSE: 118.4555

The Bagging model shows an improvement of 5.16% over the baseline.


In [91]:
# Calculate and report the RMSE on the test set.

# Make predictions on the test set using the Bagging model
y_pred_bagging = bagging_model.predict(X_test)

# Calculate the RMSE for the Bagging model
rmse_bagging = np.sqrt(mean_squared_error(y_test, y_pred_bagging))

print(f"Bagging Regressor RMSE: {rmse_bagging:.4f}")

# Name the baseline with the same comparison that selected `baseline_rmse`,
# so the printed label cannot disagree with the number printed next to it.
baseline_label = "Decision Tree" if rmse_dt < rmse_lr else "Linear Regression"

print("--- Comparison ---")
# 1. Compare Bagging vs. its Base Estimator (Single Decision Tree)
print(f"Single Decision Tree RMSE: {rmse_dt:.4f}")
if rmse_bagging < rmse_dt:
    improvement_dt = ((rmse_dt - rmse_bagging) / rmse_dt) * 100
    print(f"Result: Bagging improved upon the single Decision Tree by {improvement_dt:.2f}%.")
else:
    print("Result: Bagging DID NOT improve upon the single Decision Tree.")

print("\n")

# 2. Compare Bagging vs. the Overall Best Baseline Model

print(f"Overall Baseline RMSE ({baseline_label}): {baseline_rmse:.4f}")
if rmse_bagging < baseline_rmse:
    improvement_lr = ((baseline_rmse - rmse_bagging) / baseline_rmse) * 100
    print(f"Result: Bagging improved upon the {baseline_label} baseline by {improvement_lr:.2f}%.")
else:
    print("Result: The Bagging model IS NOT BETTER than the overall baseline.")

Bagging Regressor RMSE: 112.3426
--- Comparison ---
Single Decision Tree RMSE: 118.4555
Result: Bagging improved upon the single Decision Tree by 5.16%.


Overall Baseline RMSE (Linear Regression): 100.4459
Result: The Bagging model IS NOT BETTER than the overall baseline.


**Did the bagging technique effectively reduce variance compared to the single Decision Tree baseline?**

The Bagging Regressor successfully demonstrated its core strength of variance reduction when compared to its base estimator, the single Decision Tree. The single tree produced a Root Mean Squared Error (RMSE) of 118.46, indicating a certain level of prediction error and instability. By training 50 individual Decision Trees on different bootstrap samples of the data and averaging their predictions, the Bagging model achieved a significantly lower RMSE of 112.34. This represents a 5.16% improvement, confirming our hypothesis that the ensemble technique effectively mitigates the high variance often associated with individual decision trees. The averaging process cancels out the idiosyncratic errors and extreme predictions of single trees, resulting in a more stable and reliable model that generalizes better to the unseen test data. This clearly shows that the bagging technique performed exactly as expected by improving its underlying base model.

Despite its success in enhancing the Decision Tree, the Bagging Regressor's final performance did not surpass our overall baseline. The best-performing single model was the Linear Regression, which established a strong baseline RMSE of 100.45. The Bagging model's RMSE of 112.34, while an improvement over the single tree, was ultimately not competitive with this linear baseline. This outcome suggests that much of the signal is additive in the encoded features: because season, weather, month, hour and weekday are one-hot encoded, the Linear Regression model can fit a separate offset for every hour of the day, whereas each depth-6 tree can split on only a few of the 53 feature columns. The simple, rigid structure of the Linear Regression model was therefore well-suited to capture these patterns. In this case, the flexibility of the depth-limited tree ensemble was not enough to outperform a simpler model whose assumptions matched the encoded data well, highlighting that more complexity does not always guarantee better results.


### Boosting (Bias Reduction)

Hypothesis: Boosting primarily targets bias reduction

The core idea behind Boosting is to build a model sequentially. Each new model in the sequence focuses on correcting the errors made by the previous one. Instead of a "committee of equals" like in Bagging, Boosting is more like a "team of specialists" where each member learns from the last, creating a powerful and highly accurate final model. It is particularly effective at reducing bias.

In [92]:
# Implement a Gradient Boosting Regressor (a robust, widely used boosting technique)

from sklearn.ensemble import GradientBoostingRegressor

# 100 boosting stages with a learning rate of 0.1; seeded with the notebook-wide
# RANDOM_SEED constant instead of a repeated literal.
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=RANDOM_SEED
)

# Train the Gradient Boosting model
gbr_model.fit(X_train, y_train)

print("Gradient Boosting Regressor model trained successfully.")

Gradient Boosting Regressor model trained successfully.


In [93]:
# Calculate and report the RMSE on the test set.

# Make predictions on the test set using the Gradient Boosting model
y_pred_gbr = gbr_model.predict(X_test)

# Calculate the RMSE for the Gradient Boosting model
rmse_gbr = np.sqrt(mean_squared_error(y_test, y_pred_gbr))

# Name the baseline with the same comparison that selected `baseline_rmse`,
# so the printed label cannot disagree with the number printed next to it.
baseline_label = "Decision Tree" if rmse_dt < rmse_lr else "Linear Regression"

print(f"Gradient Boosting Regressor RMSE: {rmse_gbr:.4f}")
# Compare the Gradient Boosting RMSE with the baseline RMSE
print(f"Overall Baseline RMSE ({baseline_label}): {baseline_rmse:.4f}")

if rmse_gbr < baseline_rmse:
    improvement = ((baseline_rmse - rmse_gbr) / baseline_rmse) * 100
    print(f"\nThe Gradient Boosting model IS BETTER than the baseline by {improvement:.2f}%.")
else:
    print("\nThe Gradient Boosting model did not improve upon the baseline.")

Gradient Boosting Regressor RMSE: 78.9652
Overall Baseline RMSE (Linear Regression): 100.4459

The Gradient Boosting model IS BETTER than the baseline by 21.39%.


**Did boosting achieve a better result than both the single model and the bagging ensemble, supporting the idea of bias reduction?**

Yes, the Gradient Boosting model achieved a significantly better result than both the single baseline models and the Bagging ensemble, strongly supporting its effectiveness at bias reduction. The model produced an outstanding RMSE of 78.97 on the test set, marking a substantial 21.39% improvement over the best baseline (Linear Regression, RMSE 100.45). Unlike Bagging, which averages independent models to reduce variance, Boosting builds its trees sequentially. Each new tree is specifically trained to correct the errors (the residual bias) left by the previous ones. This iterative process allows the model to learn the more complex, non-linear patterns in the bike-sharing data that the simpler Linear Regression model was unable to capture. The result is a single, powerful predictive model that fits the training data more closely without overfitting, leading to superior accuracy on unseen data.

The superior performance of Gradient Boosting highlights the different strengths of ensemble techniques. While Bagging was successful in its specific goal of stabilizing the single Decision Tree, its final RMSE of 112.34 was not competitive. Its "wisdom of the crowd" approach was not as effective as Boosting's focused, expert-driven method for this problem. The baseline Linear Regression model, while strong, was likely too simple (high bias) and assumed linear relationships that do not fully represent the data's complexity. Gradient Boosting directly addresses this weakness by sequentially chipping away at prediction errors, building a more nuanced understanding of the relationships between features like time and weather and the number of bike rentals. This makes it the most effective strategy so far for this task.

## Part C: Stacking for Optimal Performance [10 points]

### Stacking Implementation

**The principle of Stacking**

Stacking is an advanced ensemble method that combines multiple, diverse models to improve predictive accuracy. It works in two levels: first, several "base" models are trained on the data. Then, a final "meta-model" is trained, not on the original features, but on the predictions made by the base models. This meta-model learns the optimal way to combine the outputs from the base models, leveraging the unique strengths of each to make a more powerful and robust final prediction.

**How does the Meta-Learner learn to combine the predictions of diverse Base Learners optimally?**

The meta-learner treats the predictions from the base models as a new set of input features. It is then trained using these predictions to forecast the actual target values. During this process, it learns the strengths and weaknesses of each base learner, figuring out which ones are more reliable and under what conditions. It might assign more weight to one model's predictions over another's, effectively discovering the optimal formula to combine their strengths for the most accurate final result.

In [94]:
# Define the following three Base Learners (Level-0):
#     ■ K-Nearest Neighbors Regressor (KNeighborsRegressor)
#     ■ Bagging Regressor (same hyperparameters as Part B, without OOB scoring)
#     ■ Gradient Boosting Regressor (same hyperparameters as Part B)

from sklearn.neighbors import KNeighborsRegressor

# Fresh, unfitted estimators are listed here; seeds come from the notebook-wide
# RANDOM_SEED constant instead of repeated literals.
base_learners = [
    ('knn', KNeighborsRegressor(n_neighbors=10)),
    ('bagging', BaggingRegressor(estimator=DecisionTreeRegressor(max_depth=6),
                                n_estimators=50,
                                random_state=RANDOM_SEED,
                                n_jobs=-1)),
    ('gbr', GradientBoostingRegressor(n_estimators=100,
                                      learning_rate=0.1,
                                      random_state=RANDOM_SEED))
]

print("Base learners for Stacking defined successfully.")

Base learners for Stacking defined successfully.


In [95]:
# Define the Meta-Learner (Level-1): Use a simple Ridge Regression model

from sklearn.linear_model import Ridge

# Seeded with the notebook-wide RANDOM_SEED constant instead of a repeated literal
meta_learner = Ridge(random_state=RANDOM_SEED)

print("Meta-learner (Ridge Regression) defined successfully.")

Meta-learner (Ridge Regression) defined successfully.


In [96]:
# Assemble the two-level ensemble from the base learners and the meta-learner.

from sklearn.ensemble import StackingRegressor

# cv=5: five cross-validation folds are used when building the meta-learner's inputs
# n_jobs=-1: use all available cores
stacking_config = {
    'estimators': base_learners,
    'final_estimator': meta_learner,
    'cv': 5,
    'n_jobs': -1,
}
stacking_model = StackingRegressor(**stacking_config)

print("Stacking Regressor created successfully.")

Stacking Regressor created successfully.


### Final Evaluation

In [97]:
# Fit the Stacking Regressor on the training split
# (the test-set RMSE is computed in the next cell).

# `fit` returns the estimator itself, so rebinding keeps the same fitted object
stacking_model = stacking_model.fit(X_train, y_train)

print("Stacking Regressor model trained successfully.")

Stacking Regressor model trained successfully.


In [98]:
# Score the stacked ensemble on the held-out test set
y_pred_stacking = stacking_model.predict(X_test)

# RMSE is the square root of the mean squared error
stacking_mse = mean_squared_error(y_test, y_pred_stacking)
rmse_stacking = np.sqrt(stacking_mse)

print(f"Stacking Regressor RMSE: {rmse_stacking:.4f}")

Stacking Regressor RMSE: 68.4148


## Part D: Final Analysis [10 points]

### Comparative Table

In [99]:
# Comparative Table: summarise the test-set RMSE of the four evaluated models:
#     ○ Baseline Single Model (Best of DT/Linear)
#     ○ Bagging Regressor
#     ○ Gradient Boosting Regressor
#     ○ Stacking Regressor

# Name the baseline with the same comparison that selected `baseline_rmse`,
# so the row label cannot disagree with the value shown next to it.
baseline_label = "Decision Tree" if rmse_dt < rmse_lr else "Linear Regression"

# Model names and their RMSE scores, in matching order
results_data = {
    'Model': [
        f'Baseline ({baseline_label})',
        'Bagging Regressor',
        'Gradient Boosting Regressor',
        'Stacking Regressor'
    ],
    'RMSE': [
        baseline_rmse,
        rmse_bagging,
        rmse_gbr,
        rmse_stacking
    ]
}

# Build the table, order it best-first (lowest RMSE), and index it by model name
results_df = (
    pd.DataFrame(results_data)
    .sort_values(by='RMSE', ascending=True)
    .set_index('Model')
)

# Display the final comparative table with 2 decimal places; the format is scoped
# to this print so the notebook-wide pandas display options are left untouched.
print("--- Final Model Performance Comparison ---")
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(results_df)

--- Final Model Performance Comparison ---
                               RMSE
Model                              
Stacking Regressor            68.41
Gradient Boosting Regressor   78.97
Baseline (Linear Regression) 100.45
Bagging Regressor            112.34


### Conclusion

**What is our best-performing model?**


Based on the final results, the *Stacking Regressor* was unequivocally the best-performing model for this task.
It achieved the lowest Root Mean Squared Error (RMSE) of 68.41, demonstrating the highest accuracy in predicting hourly bike rentals. This result was a significant improvement not only over the baseline (100.45) but also over the next best model, the Gradient Boosting Regressor (78.97). The Stacking model's superior performance highlights the power of combining diverse modeling strategies. By using the predictions of the Bagging, Gradient Boosting, and KNN models as inputs, the Ridge meta-learner was able to learn the optimal way to weigh each model's strengths and weaknesses. This allowed it to capture a more nuanced and complex set of patterns than any single model could on its own, leading to the most robust and accurate predictions overall.

**Explain why the Stacking Regressor (or the best ensemble) outperformed the single model baseline, referencing the concepts of the bias-variance trade-off and model diversity.**

The Stacking Regressor emerged as the definitive best-performing model, achieving the lowest RMSE of 68.41. Its success over the single model baseline and other ensembles can be directly attributed to its intelligent use of model diversity and its effective management of the bias-variance trade-off. Our baseline model, a simple Linear Regression, was a low-variance but high-bias model. It made strong, rigid assumptions about the linear nature of the data and was incapable of capturing the complex, non-linear relationships inherent in bike rental demand, such as the distinct morning and evening rush hours or the nuanced effects of different weather conditions. While this simplicity made it stable, its high bias fundamentally limited its predictive ceiling, resulting in a modest RMSE of 100.45.

Ensemble methods are designed to overcome the limitations of such single estimators. The Gradient Boosting Regressor, our second-best model, performed exceptionally well by tackling the bias problem head-on. It sequentially built models to correct the errors of its predecessors, creating a powerful, low-bias predictor that learned the data's intricate patterns. However, Stacking took this a step further by embracing model diversity. It didn't rely on just one strong technique; instead, it combined three fundamentally different approaches as its base learners: the bias-reducing Gradient Boosting Regressor, the variance-reducing Bagging Regressor, and a completely distinct instance-based model, K-Nearest Neighbors. These models "see" the data in different ways and, crucially, make different types of errors.

The genius of the Stacking model lies in its meta-learner. This final Ridge Regression model was trained not on the original data, but on the cross-validated predictions generated by this diverse team of base models. It effectively learned to act as an intelligent manager, identifying the overall strengths and weaknesses of each base learner and how to best combine their outputs. Because the meta-learner is a linear model that sees only the three base-model predictions (the original features are not passed through to it), it learns a single global set of weights rather than condition-specific rules: for instance, it can place the largest weight on the Gradient Boosting predictions while still using the KNN and Bagging predictions to offset some of its systematic errors. By learning this optimal, weighted combination, the Stacking Regressor achieved the best of all worlds—synthesizing the low-bias strengths of a powerful model like Gradient Boosting with the stability and unique perspectives of others. This produced a final model with a more sophisticated and robust understanding of the data, resulting in the lowest overall error and the most accurate predictions.