In [3]:
import numpy as np
import pandas as pd


In [4]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-03 17:23:38--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-11-03 17:23:38 (3.75 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [5]:
# Read the downloaded dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="car_fuel_efficiency.csv")

In [6]:
# Replace every missing value with 0; `df` is rebound to the filled copy.
df = df.fillna(value=0)

In [7]:
# Separate the regression target from the feature columns.
target = "fuel_efficiency_mpg"
X = df.drop(target, axis="columns")  # every column except the target
y = df[target].values                # target values as an array

In [8]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 40% of the rows first, then cut that hold-out in
# half to get validation and test sets (60/20/20 overall). Both stages use the
# same fixed seed so the partitions are reproducible.
X_full_train, X_temp, y_full_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=1,
)

# Row counts of the train, validation and test partitions, in that order.
print(*(len(part) for part in (X_full_train, X_val, X_test)))


5822 1941 1941


In [9]:
from sklearn.feature_extraction import DictVectorizer

# Turn each split into a list of {column: value} records, one dict per row.
train_dicts, val_dicts, test_dicts = (
    frame.to_dict(orient="records")
    for frame in (X_full_train, X_val, X_test)
)

# The vectorizer learns its feature set from the training records only and is
# then reused unchanged for the validation and test records.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

# NOTE: X_val and X_test are rebound here from the split DataFrames to the
# encoded matrices. Re-running this cell without first re-running the split
# cell is therefore expected to fail (the matrices have no `to_dict`).
X_val, X_test = dv.transform(val_dicts), dv.transform(test_dicts)


In [10]:
from sklearn.tree import DecisionTreeRegressor

# A depth-1 tree makes exactly one split, so the feature it picks at the root
# is the single split feature of the model.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_full_train)

# tree_.feature[0] is the column index tested at the root node; translate it
# back to a readable name through the vectorizer's feature list.
feature_index = dt.tree_.feature[0]
split_feature = dv.feature_names_[feature_index]

split_feature


'vehicle_weight'

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(
    X_train, y_full_train
)

# Score on the validation split; RMSE is the square root of the MSE.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse


np.float64(0.4602815367032658)

In [12]:
# print() shows the bare number instead of the np.float64(...) repr that the
# previous cell's rich output displays.
print(rmse)

0.4602815367032658


In [13]:
# Sweep the number of trees from 10 to 200 in steps of 10 and record the
# validation RMSE of each forest as (n_estimators, rmse).
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(
        X_train, y_full_train
    )

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    scores += [(n, rmse)]

scores


[(10, np.float64(0.4602815367032659)),
 (20, np.float64(0.44615674589110027)),
 (30, np.float64(0.4397780761280069)),
 (40, np.float64(0.4383939265191818)),
 (50, np.float64(0.43717032494674524)),
 (60, np.float64(0.4355914081920472)),
 (70, np.float64(0.4361123859130258)),
 (80, np.float64(0.43605455887808786)),
 (90, np.float64(0.4354100823440765)),
 (100, np.float64(0.4352773655478666)),
 (110, np.float64(0.434896815770466)),
 (120, np.float64(0.43546652508605704)),
 (130, np.float64(0.43492336206666454)),
 (140, np.float64(0.43510682291642017)),
 (150, np.float64(0.4351910645153306)),
 (160, np.float64(0.43523690427566636)),
 (170, np.float64(0.43520773900215154)),
 (180, np.float64(0.4352404093499596)),
 (190, np.float64(0.43539799338117574)),
 (200, np.float64(0.4350031248889441))]

In [14]:
# Same sweep results, rounded to 3 decimals so it is easy to see where the
# RMSE stops improving.
for n, rmse in scores:
    print(n, round(rmse, ndigits=3))


10 0.46
20 0.446
30 0.44
40 0.438
50 0.437
60 0.436
70 0.436
80 0.436
90 0.435
100 0.435
110 0.435
120 0.435
130 0.435
140 0.435
150 0.435
160 0.435
170 0.435
180 0.435
190 0.435
200 0.435


In [15]:
# Grid search: for every max_depth, train one forest per n_estimators value
# and average the validation RMSE across all of those forests.
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

results = {}  # max_depth -> mean validation RMSE over all n_estimators

for max_depth in max_depth_values:
    rmses = []
    for n in n_estimators_values:
        model = RandomForestRegressor(
            n_estimators=n, max_depth=max_depth, random_state=1, n_jobs=-1
        ).fit(X_train, y_full_train)
        y_pred = model.predict(X_val)

        # RMSE computed as sqrt(MSE) rather than via the `squared` kwarg.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmses.append(rmse)

    results[max_depth] = mean_rmse = float(np.mean(rmses))
    print(f"max_depth={max_depth}, mean RMSE = {mean_rmse:.3f}")

# The best depth is the one with the lowest mean RMSE (first one wins on ties).
best_depth = min(results, key=lambda depth: results[depth])
print("\nBest max_depth:", best_depth, "with mean RMSE =", round(results[best_depth], 3))

max_depth=10, mean RMSE = 0.436
max_depth=15, mean RMSE = 0.438
max_depth=20, mean RMSE = 0.438
max_depth=25, mean RMSE = 0.438

Best max_depth: 10 with mean RMSE = 0.436


In [16]:
# Forest whose feature importances are inspected in the next cell:
# 10 trees, depth capped at 20, fixed seed, all CPU cores.
model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

# fit() is the last expression, so the fitted estimator is shown as the cell output.
model.fit(X_train, y_full_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
# Pair every encoded feature name (from the DictVectorizer) with its
# importance in the fitted forest.
feature_names = dv.get_feature_names_out()
importances = model.feature_importances_

# Print the pairs from most to least important.
for name, importance in sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
):
    print(name, importance)

vehicle_weight 0.9598782143148441
horsepower 0.015933481489766168
acceleration 0.011442313735237557
engine_displacement 0.003159424030350312
model_year 0.003066056772754424
num_cylinders 0.0023231449140431217
num_doors 0.0015756291753787894
origin=USA 0.0004959383688367859
origin=Asia 0.00043094762405633503
origin=Europe 0.00041885303929771223
fuel_type=Diesel 0.00034918888161679113
drivetrain=All-wheel drive 0.0003428361850955144
fuel_type=Gasoline 0.0003071871419763815
drivetrain=Front-wheel drive 0.00027678432674605027


In [18]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]━━━[0m [32m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1

[1m[[0

In [20]:
import xgboost as xgb

# Wrap the encoded train/validation features and their targets in XGBoost's
# DMatrix container, which xgb.train consumes.
dtrain = xgb.DMatrix(data=X_train, label=y_full_train)
dval = xgb.DMatrix(data=X_val, label=y_val)


In [21]:
# Datasets evaluated after every boosting round, each with the label used in
# the training log ("train-rmse", "val-rmse").
watchlist = [(dtrain, "train"), (dval, "val")]


In [22]:
# Booster settings for the first run (learning rate eta = 0.3).
params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective="reg:squarederror",
    nthread=8,
    seed=1,
    verbosity=1,
)

# 100 boosting rounds; train/val RMSE is logged each round via the watchlist.
model_03 = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)


[0]	train-rmse:1.83282	val-rmse:1.82567
[1]	train-rmse:1.33231	val-rmse:1.32771
[2]	train-rmse:0.99034	val-rmse:0.99257
[3]	train-rmse:0.76090	val-rmse:0.76897
[4]	train-rmse:0.61110	val-rmse:0.62742
[5]	train-rmse:0.51643	val-rmse:0.54010
[6]	train-rmse:0.45800	val-rmse:0.48954
[7]	train-rmse:0.42172	val-rmse:0.46026
[8]	train-rmse:0.39836	val-rmse:0.44332
[9]	train-rmse:0.38494	val-rmse:0.43456
[10]	train-rmse:0.37400	val-rmse:0.43004
[11]	train-rmse:0.36596	val-rmse:0.42696
[12]	train-rmse:0.36050	val-rmse:0.42569
[13]	train-rmse:0.35549	val-rmse:0.42519
[14]	train-rmse:0.35143	val-rmse:0.42455
[15]	train-rmse:0.34792	val-rmse:0.42450
[16]	train-rmse:0.34533	val-rmse:0.42478
[17]	train-rmse:0.34356	val-rmse:0.42472
[18]	train-rmse:0.34129	val-rmse:0.42472
[19]	train-rmse:0.33846	val-rmse:0.42502
[20]	train-rmse:0.33724	val-rmse:0.42509
[21]	train-rmse:0.33463	val-rmse:0.42538
[22]	train-rmse:0.33260	val-rmse:0.42543
[23]	train-rmse:0.33023	val-rmse:0.42611
[24]	train-rmse:0.32738	va

In [23]:
# Lower the learning rate in the shared `params` dict (every other setting is
# left as defined above) and train a second booster for comparison.
params.update(eta=0.1)
model_01 = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)


[0]	train-rmse:2.31334	val-rmse:2.30592
[1]	train-rmse:2.09552	val-rmse:2.08865
[2]	train-rmse:1.90001	val-rmse:1.89221
[3]	train-rmse:1.72438	val-rmse:1.71766
[4]	train-rmse:1.56719	val-rmse:1.56150
[5]	train-rmse:1.42645	val-rmse:1.42157
[6]	train-rmse:1.30047	val-rmse:1.29580
[7]	train-rmse:1.18786	val-rmse:1.18468
[8]	train-rmse:1.08744	val-rmse:1.08657
[9]	train-rmse:0.99801	val-rmse:0.99964
[10]	train-rmse:0.91846	val-rmse:0.92183
[11]	train-rmse:0.84797	val-rmse:0.85324
[12]	train-rmse:0.78540	val-rmse:0.79241
[13]	train-rmse:0.73026	val-rmse:0.73968
[14]	train-rmse:0.68164	val-rmse:0.69327
[15]	train-rmse:0.63889	val-rmse:0.65351
[16]	train-rmse:0.60130	val-rmse:0.61854
[17]	train-rmse:0.56852	val-rmse:0.58847
[18]	train-rmse:0.53982	val-rmse:0.56232
[19]	train-rmse:0.51488	val-rmse:0.53952
[20]	train-rmse:0.49316	val-rmse:0.52039
[21]	train-rmse:0.47428	val-rmse:0.50442
[22]	train-rmse:0.45775	val-rmse:0.49005
[23]	train-rmse:0.44362	val-rmse:0.47827
[24]	train-rmse:0.43128	va

In [24]:
# Validation RMSE of the booster trained with eta = 0.3.
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))

# Validation RMSE of the booster trained with eta = 0.1.
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))

# Report both so the two learning rates can be compared side by side.
print("RMSE (eta=0.3):", rmse_03)
print("RMSE (eta=0.1):", rmse_01)


RMSE (eta=0.3): 0.44340462733166064
RMSE (eta=0.1): 0.4167428683326873
