# Homework 6: **Decision Trees and Ensemble Learning**

## 1. Import packages

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from math import sqrt

## 2. Load and prepare data

Load data

In [3]:
# Read the raw CSV once and leave it untouched in `df_raw`; later cleaning
# steps operate on the working copy `df`.
df_raw = pd.read_csv("data/car_fuel_efficiency.csv")
df = df_raw.copy()
# Show dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


Fill missing values with zeros

In [4]:
# Replace every missing value in the working frame with 0, then re-check the
# non-null counts to confirm nothing is left unfilled.
df = df.fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9704 non-null   float64
 2   horsepower           9704 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         9704 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9704 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


Separate features/target


In [6]:
# Name of the column to predict.
target = 'fuel_efficiency_mpg'

# Feature frame: every column except the target.
X = df.drop(columns=target)
# Target as a plain NumPy array.
y = df[target].to_numpy()

Split data into train/validation/test - 60%/20%/20% distribution

In [7]:
# First cut: 60% of the rows go to training, the remaining 40% are held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

# Second cut: halve the held-out 40% into validation and test (20% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=1,
)

Turn features into a numeric matrix with DictVectorizer

In [8]:
# Convert each split to a list of per-row dicts, the input format DictVectorizer expects.
train_records = X_train.to_dict(orient='records')
val_records = X_val.to_dict(orient='records')
test_records = X_test.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted mapping
# for validation and test so all three matrices share the same columns.
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_records)
X_val_dv = dv.transform(val_records)
X_test_dv = dv.transform(test_records)

In [13]:
# Sanity-check the split sizes (shapes of the raw feature frames, before vectorization).
for label, frame in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"{label} shape:", frame.shape)

X_train shape: (5822, 10)
X_val shape: (1941, 10)
X_test shape: (1941, 10)


## 3. Question 1: train a decision tree regressor

Set max_depth = 1

Get feature names

In [14]:
# Names of the vectorized columns; categorical columns show up as
# `column=value` entries. Indexed later by the tree's split-feature index.
feature_names = dv.get_feature_names_out()
feature_names

array(['acceleration', 'drivetrain=All-wheel drive',
       'drivetrain=Front-wheel drive', 'engine_displacement',
       'fuel_type=Diesel', 'fuel_type=Gasoline', 'horsepower',
       'model_year', 'num_cylinders', 'num_doors', 'origin=Asia',
       'origin=Europe', 'origin=USA', 'vehicle_weight'], dtype=object)

Instantiate and train decision tree regressor

In [15]:
# max_depth=1 limits the tree to a single split, so the root node alone shows
# which feature is used to split the data.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_dv, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


Get the feature used for splitting the data

In [17]:
# Node 0 of the fitted tree is the root; `tree_.feature[0]` is the index of
# the column it splits on.
root_feature_idx = dt.tree_.feature[0]

# The `>= 0` guard covers a root that has no split feature (no split was made).
if root_feature_idx >= 0:
    split_feature = feature_names[root_feature_idx]
else:
    split_feature = None

# One-hot columns are named `column=value`; keep only the text before the
# first '='. `partition` returns the whole name when there is no '='.
split_base = split_feature.partition('=')[0] if split_feature else split_feature

print("Split feature, raw DV name:", split_feature)
print("Split feature - base:", split_base)

Split feature, raw DV name: vehicle_weight
Split feature - base: vehicle_weight


## 4. Question 2: Train a random forest regressor

Set:
- n_estimators = 10
- random_state = 1
- n_jobs = -1

RMSE metric: computed with scikit-learn's `root_mean_squared_error`, so no custom helper function is needed

Instantiate and train a random forest regressor

In [20]:
# Baseline forest: 10 trees, fixed seed for reproducibility, n_jobs=-1 to
# train on all available CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_dv, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Predict on validation set

In [21]:
# Predictions of the 10-tree forest for the validation rows.
y_val_rf_pred = rf.predict(X_val_dv)

Evaluate rf on validation set

In [24]:
# Validation RMSE of the 10-tree forest (true values first, predictions second).
rmse_rf = root_mean_squared_error(y_val, y_val_rf_pred)
rmse_rf

0.4602815367032659

## 5. Question 3: Experiment with different n_estimators parameter values
- Try n_estimators values from 10 to 200 with step 10.
- Set random_state = 1

In [28]:
# Forest sizes to evaluate: 10, 20, ..., 200. Must be increasing, because a
# warm-started forest can only be grown, never shrunk.
ns = list(range(10, 201, 10))

# warm_start=True keeps the trees that are already fitted and, on each new
# `fit`, only adds the trees needed to reach the new n_estimators. The initial
# size is taken from `ns` so the grid is defined in exactly one place.
rf_ws = RandomForestRegressor(n_estimators=ns[0], random_state=1, n_jobs=-1, warm_start=True)

rmses = []                 # (n_estimators, validation RMSE) pairs, in grid order
best_rmse_rounded = None   # lowest validation RMSE so far, rounded to 3 decimals
last_improvement_n = None  # forest size at which that rounded minimum was first reached

for n in ns:
    # Grow the forest to `n` trees. The first pass fits the initial ns[0]
    # trees; later passes only fit the additional ones.
    rf_ws.set_params(n_estimators=n)
    rf_ws.fit(X_train_dv, y_train)

    r = root_mean_squared_error(y_val, rf_ws.predict(X_val_dv))
    rmses.append((n, r))

    # Compare at 3-decimal precision so that changes below 0.001 do not count
    # as an improvement.
    r3 = round(r, 3)
    if best_rmse_rounded is None or r3 < best_rmse_rounded:
        best_rmse_rounded = r3
        last_improvement_n = n

q3_df = pd.DataFrame(rmses, columns=['n_estimators', 'val_RMSE'])
print("RMSE vs n_estimators on validation set:\n", q3_df)

RMSE vs n_estimators on validation set:
     n_estimators  val_RMSE
0             10  0.460282
1             20  0.446157
2             30  0.439778
3             40  0.438394
4             50  0.437170
5             60  0.435591
6             70  0.436112
7             80  0.436055
8             90  0.435410
9            100  0.435277
10           110  0.434897
11           120  0.435467
12           130  0.434923
13           140  0.435107
14           150  0.435191
15           160  0.435237
16           170  0.435208
17           180  0.435240
18           190  0.435398
19           200  0.435003


In [30]:
# Report the forest size at which the rounded validation RMSE last reached a
# new minimum, together with that minimum.
print("Last estimator improvement:", last_improvement_n)
print("Best RMSE:", best_rmse_rounded)

Last estimator improvement: 90
Best RMSE: 0.435
