In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge


## Part A: Data Preprocessing and Baseline

### 1.​ Data Loading and Feature Engineering

In [2]:
# Read the raw dataset from hour.csv (relative to the notebook's working
# directory) and preview its first five rows.
hour_df = pd.read_csv(filepath_or_buffer='hour.csv')
hour_df.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


#### 1.1 Drop Irrelevant Columns

In [3]:
# Drop columns that must not be used as model inputs:
#   - 'instant' and 'dteday' are a row counter and a raw date string.
#   - 'casual' and 'registered' add up to the target 'cnt' in the previewed
#     rows (e.g. 3 + 13 = 16), so keeping them would leak the target.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
hour_df = hour_df.drop(
    columns=['instant', 'dteday', 'casual', 'registered'],
    errors='ignore',
)

#### 1.2 Null Values Check

- I am not checking for null values because, as mentioned in the data description, there are no missing values.

#### 1.3 Converting Categorical Features into Numerical

In [4]:
# One-hot encode the categorical columns. drop='first' removes the first
# level of every feature so the resulting dummy columns are not redundant.
categorical_features = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_features = ohe.fit_transform(hour_df[categorical_features])
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

# Reuse hour_df's index for the encoded frame: pd.concat(axis=1) aligns rows
# by index label, so a fresh RangeIndex would misalign rows (and introduce
# NaNs) if hour_df's index were ever not 0..n-1, e.g. after filtering rows.
ohe_df = pd.DataFrame(ohe_features, columns=ohe_feature_names, index=hour_df.index)

# Replace the original categorical columns with their encoded counterparts.
hour_df = pd.concat([hour_df.drop(columns=categorical_features), ohe_df], axis=1)

In [5]:
# Preview the first five rows after one-hot encoding.
hour_df.head(n=5)

Unnamed: 0,temp,atemp,hum,windspeed,cnt,season_2,season_3,season_4,yr_1,mnth_2,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3,weathersit_4
0,0.24,0.2879,0.81,0.0,16,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.22,0.2727,0.8,0.0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.22,0.2727,0.8,0.0,32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.24,0.2879,0.75,0.0,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.24,0.2879,0.75,0.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### 2.​ Train/Test Split

In [6]:
# 'cnt' is the regression target; every remaining column is a predictor.
y = hour_df['cnt']
X = hour_df.drop(columns=['cnt'])

In [7]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### 3.​ Baseline Model

#### 3.1 Decision Tree Regressor

In [8]:
# Baseline 1: a single regression tree capped at depth 6, seeded for
# reproducibility. fit() returns the estimator, so it is still displayed
# as the cell output.
dt_model = DecisionTreeRegressor(max_depth=6, random_state=42)
dt_model.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [9]:
# Evaluate the decision tree on the held-out split.
# RMSE is the square root of the mean squared error.
dt_preds = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_true=y_test, y_pred=dt_preds)
dt_rmse = dt_mse ** 0.5

print(f"RMSE of Decision Tree Regressor is : {dt_rmse:.2f}")

RMSE of Decision Tree Regressor is : 118.46


#### 3.2 Linear Regression

In [10]:
# Baseline 2: ordinary least-squares linear regression with default settings.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model  # display the fitted estimator, as the original cell did

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
# Evaluate linear regression on the held-out split.
# RMSE is the square root of the mean squared error.
lr_preds = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_preds)
lr_rmse = lr_mse ** 0.5

print(f"RMSE of Linear Regression is : {lr_rmse:.2f}")

RMSE of Linear Regression is : 100.45


## Part B: Ensemble Techniques for Bias and Variance Reduction

### 1.1 Bagging (Variance Reduction)

In [12]:
# Bagging ensemble: 100 estimators with dt_model (the depth-6 tree) as the
# base estimator, seeded for reproducibility.
bag_model = BaggingRegressor(estimator=dt_model, n_estimators=100, random_state=42)
bag_model.fit(X_train, y_train)


0,1,2
,estimator,DecisionTreeR...ndom_state=42)
,n_estimators,100
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,42

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
# Evaluate the bagging ensemble on the held-out split.
# RMSE is the square root of the mean squared error.
bg_preds = bag_model.predict(X_test)
bg_mse = mean_squared_error(y_true=y_test, y_pred=bg_preds)
bg_rmse = bg_mse ** 0.5

print(f"RMSE of Bagging Regressor is : {bg_rmse:.2f}")

RMSE of Bagging Regressor is : 112.27


### 1.1.1 Performance Comparison of Bagging vs Single Decision Tree

| Model                        | RMSE   |
|------------------------------|--------|
| Decision Tree Regressor      | 118.46 |
| Bagging Regressor| 112.27 |

- We can see that the **Bagging Regressor** has a lower RMSE than the single **Decision Tree Regressor** (112.27 vs 118.46, roughly a 5% reduction), a real but modest improvement.

- This is because a single decision tree is known for its **high variance**: even a small change in the data can change the tree's structure drastically.

- Bagging reduces this by **averaging predictions** from many trees trained on slightly different random samples.

- The averaging process effectively reduces the variance, leading to a more robust model that generalizes better, which results in a **lower RMSE**.

### 1.2 Boosting (Bias Reduction)

In [14]:
# Gradient boosting with library-default hyperparameters; only the seed is
# fixed. fit() returns the estimator, so it is still displayed as the cell
# output.
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(X_train, y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [15]:
# Evaluate gradient boosting on the held-out split.
# RMSE is the square root of the mean squared error.
gbr_preds = gbr_model.predict(X_test)
gbr_mse = mean_squared_error(y_true=y_test, y_pred=gbr_preds)
gbr_rmse = gbr_mse ** 0.5

print(f"RMSE of Gradient Boosting Regressor is : {gbr_rmse:.2f}")

RMSE of Gradient Boosting Regressor is : 78.97


### 1.2.1 Performance Comparison of Bagging vs Single Decision Tree vs Boosting

| Model                        | RMSE   |
|------------------------------|--------|
| Decision Tree Regressor      | 118.46 |
| Gradient Boosting Regressor  | 78.97  |
| Bagging Regressor| 112.27 |

- Out of the **Decision Tree Regressor**, **Gradient Boosting Regressor** and **Bagging Regressor**, the Gradient Boosting Regressor achieved the best result, with an RMSE of **78.97**.

- The single Decision Tree suffered from high variance, which Bagging helped reduce by averaging. However, the drop in RMSE was not very significant, which suggests that the base model (a tree limited to depth 6) also suffered from high **bias**.

- Boosting effectively reduced bias by sequentially training multiple weak learners, where each tree focused on correcting the errors made by the previous ones.

- As a result, the bias of the model was reduced, leading to a more robust model that generalizes better, which results in a **lower RMSE**.

## Part C: Stacking for Optimal Performance

### 1.​ Stacking Implementation

#### 1.1 Principle of Stacking

- **Stacking** is an **ensemble** learning technique where **multiple diverse models** are trained on the same dataset and then a **meta learner** is trained to combine their predictions in an optimal way

**How Does Stacking Work?**

- First we will train several models independently on the training data.

- Each model or **base learner** predicts outputs. These predictions are then used as features to train another model (**meta-learner**).

- The meta-learner's job is to learn the **best possible** way to combine the predictions from the base learners.

**How Does the Meta-Learner Combine Predictions?**

- The meta-learner takes the predictions from all base models as inputs and learns weights for each.

- For example, among the models above Boosting performed best, so the meta-learner will typically assign a **higher weight** to the Boosting model's predictions.

- Essentially, it learns an **optimal way** to combine all the predictions, using the weights it learns, so that the **final prediction error** is minimized.



#### 1.2 Base Learners

In [16]:
# Stacked ensemble: three different base learners (bagged trees, gradient
# boosting, k-nearest neighbours) whose predictions are combined by a Ridge
# meta-learner.
knr_model = KNeighborsRegressor(n_neighbors=5)
stkr_model = StackingRegressor(
    estimators=[('bg', bag_model), ('gbr', gbr_model), ('knr', knr_model)],
    final_estimator=Ridge(alpha=1.0),
)
stkr_model.fit(X_train, y_train)

0,1,2
,estimators,"[('bg', ...), ('gbr', ...), ...]"
,final_estimator,Ridge()
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


### 2.​ Final Evaluation

In [17]:
# Evaluate the stacked ensemble on the held-out split.
# RMSE is the square root of the mean squared error.
stkr_preds = stkr_model.predict(X_test)
stkr_mse = mean_squared_error(y_true=y_test, y_pred=stkr_preds)
stkr_rmse = stkr_mse ** 0.5

print(f"RMSE of Stacking Regressor is : {stkr_rmse:.2f}")

RMSE of Stacking Regressor is : 67.05


## Part D: Final Analysis

### 1.​ Comparative Table

| Model                        | RMSE   |
|------------------------------|--------|
| Stacking Regressor           | 67.05  |
| Gradient Boosting Regressor  | 78.97  |
| Linear Regression            | 100.45 |
| Bagging Regressor            | 112.27 |
| Decision Tree Regressor      | 118.46 |


### 2.​ Conclusion

- The best performing model is the **Stacking Regressor**, which achieved an RMSE of **67.05**.

**The Bias Variance TradeOff**
- The baseline Linear Regression model has **high bias** and **low variance**. It captures only linear relationships, which limits its ability to identify **non-linear patterns** in the data.

- The **Bagging Regressor** helped reduce variance by averaging predictions from multiple Decision Trees. However, since all base learners were similar (Decision Trees with depth 6), it did not significantly reduce bias, which is why the improvement was modest.

- The **Gradient Boosting Regressor** effectively reduced bias by sequentially training multiple weak learners, where each tree focused on correcting the errors made by the previous ones. This approach significantly improved accuracy compared to both the single and bagged models, as it gradually minimized the systematic errors. However, it can be prone to **overfitting**, because it continues to learn and correct even very small residual errors, including the **noise** in the training data.

- In contrast, the **Stacking Regressor** combines multiple models (base learners) and merges their results in an **optimal way** using a meta-learner.

- This approach ensures model diversity and leverages the strengths of the base learners while minimizing their errors.

- This helps the model capture both linear and non-linear relationships, resulting in a more generalized model, and it also helps in finding the best overall **bias-variance trade-off**.

- As a result, the Stacking Regressor achieved the lowest RMSE.
