### Import the libraries

In [5]:
!pip install -U scikit-learn xgboost



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#### Load data

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [8]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [10]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Fill missing values

In [12]:
df = df.fillna(0)

### Split into features and target

In [13]:
y = df.fuel_efficiency_mpg.values
del df['fuel_efficiency_mpg']

### Split the data - 60/20/20

In [17]:
df_full_train, df_test, y_full_train, y_test = train_test_split(df, y, test_size=0.2, random_state=1)
df_train, df_val, y_train, y_val = train_test_split(df_full_train, y_full_train, test_size=0.25, random_state=1)

len(df_train), len(df_val), len(df_test)

(5822, 1941, 1941)

### Prepare the data using DictVectorizer

In [19]:
dv = DictVectorizer(sparse=True)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

### Q1 - Decision tree with max_depth=1

In [20]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Check which feature was used

In [21]:
feature_names = dv.get_feature_names_out()
dt.tree_.feature[0]
feature_names[dt.tree_.feature[0]]

'vehicle_weight'

### Q2 - Random forest with n_estimators=10

In [22]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE: {rmse:.3f}')

RMSE: 0.459


### Q3 - Trying different n_estimators

In [23]:
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))
    print(f'{n}: {rmse:.3f}')

10: 0.459
20: 0.454
30: 0.451
40: 0.448
50: 0.446
60: 0.445
70: 0.445
80: 0.445
90: 0.445
100: 0.445
110: 0.444
120: 0.444
130: 0.444
140: 0.444
150: 0.443
160: 0.443
170: 0.443
180: 0.443
190: 0.443
200: 0.443


### Check when it stops improving (rounded to 3 decimals)

In [24]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])
df_scores['rmse_rounded'] = df_scores.rmse.round(3)
df_scores

Unnamed: 0,n_estimators,rmse,rmse_rounded
0,10,0.458662,0.459
1,20,0.45368,0.454
2,30,0.451172,0.451
3,40,0.448357,0.448
4,50,0.446179,0.446
5,60,0.4453,0.445
6,70,0.444674,0.445
7,80,0.444994,0.445
8,90,0.445205,0.445
9,100,0.444896,0.445


### Q4 - Finding best max_depth

In [25]:
depths = [10, 15, 20, 25]
results = {}

for d in depths:
    print(f'\nmax_depth={d}')
    scores = []
    
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        scores.append(rmse)
    
    results[d] = np.mean(scores)
    print(f'mean rmse: {results[d]:.4f}')


max_depth=10
mean rmse: 0.4419

max_depth=15
mean rmse: 0.4456

max_depth=20
mean rmse: 0.4457

max_depth=25
mean rmse: 0.4457


### Q5 - Feature importance

In [26]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Get feature importances

In [27]:
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance.head(10)

# Check specific features
features_to_check = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']

for f in features_to_check:
    imp = feature_importance[feature_importance.feature.str.contains(f, case=False)].importance.sum()
    print(f'{f}: {imp:.4f}')

vehicle_weight: 0.9592
horsepower: 0.0161
acceleration: 0.0115
engine_displacement: 0.0033


### Q6 - XGBoost with different eta values

In [30]:
# Convert data to DMatrix (without feature_names to avoid the error)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]

# eta = 0.3
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                  evals=watchlist, verbose_eval=10)

y_pred = model.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE (eta=0.3): {rmse_03:.4f}')

# eta = 0.1
xgb_params['eta'] = 0.1

model = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                  evals=watchlist, verbose_eval=10)

y_pred = model.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE (eta=0.1): {rmse_01:.4f}')

# Compare
print(f'\nComparison:')
print(f'eta=0.3: {rmse_03:.4f}')
print(f'eta=0.1: {rmse_01:.4f}')

if rmse_03 < rmse_01:
    print('Best eta: 0.3')
elif rmse_01 < rmse_03:
    print('Best eta: 0.1')
else:
    print('Both give equal value')

[0]	train-rmse:1.81393	val-rmse:1.85444
[10]	train-rmse:0.37115	val-rmse:0.43896
[20]	train-rmse:0.33553	val-rmse:0.43376
[30]	train-rmse:0.31475	val-rmse:0.43752
[40]	train-rmse:0.30202	val-rmse:0.43968
[50]	train-rmse:0.28456	val-rmse:0.44140
[60]	train-rmse:0.26768	val-rmse:0.44290
[70]	train-rmse:0.25489	val-rmse:0.44531
[80]	train-rmse:0.24254	val-rmse:0.44689
[90]	train-rmse:0.23193	val-rmse:0.44839
[99]	train-rmse:0.21950	val-rmse:0.45018
RMSE (eta=0.3): 0.4502
[0]	train-rmse:2.28944	val-rmse:2.34561
[10]	train-rmse:0.91008	val-rmse:0.94062
[20]	train-rmse:0.48983	val-rmse:0.53064
[30]	train-rmse:0.38342	val-rmse:0.44289
[40]	train-rmse:0.35343	val-rmse:0.42746
[50]	train-rmse:0.33998	val-rmse:0.42498
[60]	train-rmse:0.33054	val-rmse:0.42456
[70]	train-rmse:0.32202	val-rmse:0.42503
[80]	train-rmse:0.31667	val-rmse:0.42563
[90]	train-rmse:0.31059	val-rmse:0.42586
[99]	train-rmse:0.30419	val-rmse:0.42623
RMSE (eta=0.1): 0.4262

Comparison:
eta=0.3: 0.4502
eta=0.1: 0.4262
Best eta: