In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df= pd.read_csv(" https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")


In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
engine_displacement,170,130,170,220,210
num_cylinders,3.0,5.0,,4.0,1.0
horsepower,159.0,97.0,78.0,,140.0
vehicle_weight,3413.433759,3149.664934,3079.038997,2542.392402,3460.87099
acceleration,17.7,17.8,15.1,20.2,14.4
model_year,2003,2007,2018,2009,2009
origin,Europe,USA,Europe,USA,Europe
fuel_type,Gasoline,Gasoline,Gasoline,Diesel,Gasoline
drivetrain,All-wheel drive,Front-wheel drive,Front-wheel drive,All-wheel drive,All-wheel drive
num_doors,0.0,0.0,0.0,2.0,2.0


In [4]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [6]:
df[df.select_dtypes(include='number').columns] = df.select_dtypes(include='number').fillna(0.0)

In [7]:
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [8]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [9]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

In [10]:
X_train= df_train.drop(columns=['fuel_efficiency_mpg'])
X_test= df_test.drop(columns=['fuel_efficiency_mpg'])
X_val= df_val.drop(columns=['fuel_efficiency_mpg'])

In [11]:
X_train_dicts = X_train.to_dict(orient='records')
X_val_dicts = X_val.to_dict(orient='records')
X_test_dicts = X_test.to_dict(orient='records')


In [12]:
vec = DictVectorizer(sparse=False)

X_train_vec = vec.fit_transform(X_train_dicts)
X_val_vec = vec.transform(X_val_dicts)
X_test_vec= vec.transform(X_test_dicts)


## Question 1

In [13]:
features = ['vehicle_weight', 'model_year', 'origin']
X_train = df_train[features]
y_train = df_train['fuel_efficiency_mpg']

In [14]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train_vec, y_train)


In [15]:
feature_names = vec.get_feature_names_out()
importance = dt.feature_importances_

top_feature_index = np.argmax(importance)
print("Feature used for splitting:", feature_names[top_feature_index])

Feature used for splitting: vehicle_weight


## Question 2

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [17]:
y_val = df_val['fuel_efficiency_mpg']
X_train = df_train.drop(columns=['fuel_efficiency_mpg'])

In [18]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train_vec, y_train)

In [19]:
y_pred = rf.predict(X_val_vec)


In [20]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", rmse)

RMSE: 0.4599777557336148


## Question 3

In [21]:
rmse_values = {}


In [22]:
for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )
    rf.fit(X_train_vec, y_train)
    y_pred = rf.predict(X_val_vec)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values[n] = rmse
    print(f"n_estimators={n}: RMSE={rmse:.3f}")

n_estimators=10: RMSE=0.460
n_estimators=20: RMSE=0.454
n_estimators=30: RMSE=0.451
n_estimators=40: RMSE=0.448
n_estimators=50: RMSE=0.446
n_estimators=60: RMSE=0.445
n_estimators=70: RMSE=0.445
n_estimators=80: RMSE=0.445
n_estimators=90: RMSE=0.445
n_estimators=100: RMSE=0.444
n_estimators=110: RMSE=0.443
n_estimators=120: RMSE=0.444
n_estimators=130: RMSE=0.443
n_estimators=140: RMSE=0.443
n_estimators=150: RMSE=0.443
n_estimators=160: RMSE=0.443
n_estimators=170: RMSE=0.443
n_estimators=180: RMSE=0.442
n_estimators=190: RMSE=0.443
n_estimators=200: RMSE=0.443


### The answer is 200

## Question 4

In [23]:
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)
results={}

In [26]:
for depth in max_depth_values:
    rmse_list = []
    for n in n_estimators_values:
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train_vec, y_train)
        y_pred = rf.predict(X_val_vec)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    results[depth] = np.mean(rmse_list)
    print(f"max_depth={depth}: mean RMSE={np.mean(rmse_list):.3f}")

max_depth=10: mean RMSE=0.442
max_depth=15: mean RMSE=0.445
max_depth=20: mean RMSE=0.446
max_depth=25: mean RMSE=0.446


In [25]:
best_depth = min(results, key=results.get)
print(f"\n✅ Best max_depth = {best_depth}, Mean RMSE = {results[best_depth]:.3f}")


✅ Best max_depth = 10, Mean RMSE = 0.442


## Question 5

In [41]:
X_model=df[['vehicle_weight','horsepower','acceleration','engine_displacement']]

In [42]:
model = RandomForestRegressor(
    n_estimators = 10,
    max_depth = 20,
    random_state = 1,
    n_jobs = -1
)
model.fit(X_train_vec, y_train)

In [43]:
importances = model.feature_importances_
for name, importance in zip(X_model.columns, importances):
    print(f"{name}: {importance:.4f}")


vehicle_weight: 0.0115
horsepower: 0.0004
acceleration: 0.0003
engine_displacement: 0.0033


In [44]:
most_important = feature_names[importances.argmax()]
print("\nMost important feature:", most_important)


Most important feature: vehicle_weight


## Question 6

In [53]:
import xgboost as xgb

In [57]:
dtrain = xgb.DMatrix(X_train_vec, label=y_train)
dval = xgb.DMatrix(X_val_vec, label=y_val)

In [58]:
watchlist = [(dtrain, 'train'), (dval, 'validation')]

In [59]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

In [60]:
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist
)

[0]	train-rmse:1.81393	validation-rmse:1.85444
[1]	train-rmse:1.31919	validation-rmse:1.35353
[2]	train-rmse:0.98120	validation-rmse:1.01316
[3]	train-rmse:0.75443	validation-rmse:0.78667
[4]	train-rmse:0.60680	validation-rmse:0.64318
[5]	train-rmse:0.51381	validation-rmse:0.55664
[6]	train-rmse:0.45470	validation-rmse:0.50321
[7]	train-rmse:0.41881	validation-rmse:0.47254
[8]	train-rmse:0.39534	validation-rmse:0.45509
[9]	train-rmse:0.38038	validation-rmse:0.44564
[10]	train-rmse:0.37115	validation-rmse:0.43896
[11]	train-rmse:0.36361	validation-rmse:0.43594
[12]	train-rmse:0.35850	validation-rmse:0.43558
[13]	train-rmse:0.35365	validation-rmse:0.43394
[14]	train-rmse:0.35025	validation-rmse:0.43349
[15]	train-rmse:0.34666	validation-rmse:0.43362
[16]	train-rmse:0.34459	validation-rmse:0.43378
[17]	train-rmse:0.34128	validation-rmse:0.43405
[18]	train-rmse:0.33822	validation-rmse:0.43391
[19]	train-rmse:0.33709	validation-rmse:0.43374
[20]	train-rmse:0.33553	validation-rmse:0.43376
[2

In [51]:
xgb_params['eta'] = 0.1

In [61]:
model_low_eta = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist
)

[0]	train-rmse:1.81393	validation-rmse:1.85444
[1]	train-rmse:1.31919	validation-rmse:1.35353
[2]	train-rmse:0.98120	validation-rmse:1.01316
[3]	train-rmse:0.75443	validation-rmse:0.78667
[4]	train-rmse:0.60680	validation-rmse:0.64318
[5]	train-rmse:0.51381	validation-rmse:0.55664
[6]	train-rmse:0.45470	validation-rmse:0.50321
[7]	train-rmse:0.41881	validation-rmse:0.47254
[8]	train-rmse:0.39534	validation-rmse:0.45509
[9]	train-rmse:0.38038	validation-rmse:0.44564
[10]	train-rmse:0.37115	validation-rmse:0.43896
[11]	train-rmse:0.36361	validation-rmse:0.43594
[12]	train-rmse:0.35850	validation-rmse:0.43558
[13]	train-rmse:0.35365	validation-rmse:0.43394
[14]	train-rmse:0.35025	validation-rmse:0.43349
[15]	train-rmse:0.34666	validation-rmse:0.43362
[16]	train-rmse:0.34459	validation-rmse:0.43378
[17]	train-rmse:0.34128	validation-rmse:0.43405
[18]	train-rmse:0.33822	validation-rmse:0.43391
[19]	train-rmse:0.33709	validation-rmse:0.43374
[20]	train-rmse:0.33553	validation-rmse:0.43376
[2

### The answer is 0.1