In [49]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

In [50]:
# Read the raw car data set from a CSV file in the working directory.
DATA_PATH = 'car_fuel_efficiency.csv'
df = pd.read_csv(DATA_PATH)

Goal: create a regression model that predicts car fuel efficiency (the `fuel_efficiency_mpg` column).

In [51]:
# Fill missing values with 0 in the numeric columns only.
# A blanket df.fillna(0) would also write the integer 0 into text columns;
# the `.str` clean-up applied to object columns further down turns such
# non-string entries back into NaN, so the gap would silently reappear.
numeric_columns = df.select_dtypes(include='number').columns
df = df.fillna(dict.fromkeys(numeric_columns, 0))

# Per-column count of values that are still missing. A non-zero count can only
# belong to a non-numeric column and needs an explicit decision before modelling.
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [52]:
# Frequency table of the door counts. Note that a 0 here may also be a
# missing value that was zero-filled in the previous step.
door_counts = df['num_doors'].value_counts()
door_counts

num_doors
 0.0    4053
 1.0    2192
-1.0    2183
-2.0     594
 2.0     563
 3.0      58
-3.0      56
-4.0       4
 4.0       1
Name: count, dtype: int64

In [53]:
# Some num_doors values are negative; keep only their magnitude.
df = df.assign(num_doors=df['num_doors'].abs())

# Re-check the distribution after the sign has been dropped.
df['num_doors'].value_counts()

num_doors
1.0    4375
0.0    4053
2.0    1157
3.0     114
4.0       5
Name: count, dtype: int64

In [54]:
# Normalise every text column: lower-case, trim surrounding whitespace, then
# turn each remaining space or hyphen into an underscore.
categorical_columns = df.select_dtypes(include='object').columns
for col in categorical_columns:
    trimmed = df[col].str.lower().str.strip()
    df[col] = trimmed.str.replace(r'[ -]', '_', regex=True)

In [55]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictors.
y = df['fuel_efficiency_mpg']
X = df.drop(columns='fuel_efficiency_mpg')

# First carve off 20% of the rows as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# ... then take 25% of the remaining 80% for validation (0.25 * 0.8 = 0.2),
# which yields a 60/20/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1
)

In [56]:
from sklearn.feature_extraction import DictVectorizer

# One dict per row for each of the three splits.
train_dicts, val_dicts, test_dicts = (
    split.to_dict(orient='records') for split in (X_train, X_val, X_test)
)

# Fit the vectorizer on the training rows only ...
dv = DictVectorizer(sparse=True)
X_train_matrix = dv.fit_transform(train_dicts)

# ... and apply the fitted feature layout to the other two splits.
X_val_matrix, X_test_matrix = dv.transform(val_dicts), dv.transform(test_dicts)

print(f"Feature names: {dv.get_feature_names_out()}")

Feature names: ['acceleration' 'drivetrain=all_wheel_drive'
 'drivetrain=front_wheel_drive' 'engine_displacement' 'fuel_type=diesel'
 'fuel_type=gasoline' 'horsepower' 'model_year' 'num_cylinders'
 'num_doors' 'origin=asia' 'origin=europe' 'origin=usa' 'vehicle_weight']


In [57]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-1 regression tree; fit() returns the fitted estimator itself.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train_matrix, y_train)

# Predictions of that tree on the training and validation matrices.
y_train_pred, y_val_pred = (
    dt.predict(matrix) for matrix in (X_train_matrix, X_val_matrix)
)

In [58]:
# Look up which column the first node of the fitted tree splits on:
# dt.tree_.feature[0] is an index into the vectorizer's feature names.
all_feature_names = dv.get_feature_names_out()
feature_idx = dt.tree_.feature[0]
feature_name = all_feature_names[feature_idx]

print(f"Feature used for splitting: {feature_name}")

Feature used for splitting: vehicle_weight


In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees with a fixed seed.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(
    X_train_matrix, y_train
)

# Root-mean-squared error of the forest on the validation split.
y_val_pred_rf = rf.predict(X_val_matrix)
val_mse = mean_squared_error(y_val, y_val_pred_rf)
val_rmse = np.sqrt(val_mse)

print(round(val_rmse,3))

0.462


In [60]:
results = {}

# Try different values of n_estimators from 10 to 200 with step 10.
n_estimators_grid = range(10, 201, 10)
for n in n_estimators_grid:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train_matrix, y_train)

    y_val_pred = rf.predict(X_val_matrix)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    # Keep the score at 3 decimals: every comparison below uses that precision.
    results[n] = round(rmse, 3)
    print(f"n_estimators={n:3d}, RMSE={rmse:.3f}")

# min() over the dict yields the first key (in insertion order, i.e. the
# smallest n) whose rounded RMSE is the lowest.
best_n = min(results, key=results.get)
min_rmse = results[best_n]

print(f"\nBest n_estimators: {best_n}")
print(f"Best RMSE: {min_rmse:.3f}")

# Find the first plateau: the last n before the rounded RMSE fails to decrease
# for the first time. This is NOT the global minimum reported above; the score
# may start improving again later in the grid.
# The previous grid point is tracked explicitly so the report does not depend
# on the grid step.
previous_n = None
previous_rmse = float('inf')
for n in sorted(results):
    if results[n] >= previous_rmse:
        print(f"\nRMSE stops improving after n_estimators={previous_n}")
        break
    previous_n, previous_rmse = n, results[n]
else:
    # No plateau anywhere in the grid: say so instead of printing nothing.
    print(f"\nRMSE improves at every step up to n_estimators={previous_n}")

n_estimators= 10, RMSE=0.462
n_estimators= 20, RMSE=0.453
n_estimators= 30, RMSE=0.451
n_estimators= 40, RMSE=0.448
n_estimators= 50, RMSE=0.447
n_estimators= 60, RMSE=0.446
n_estimators= 70, RMSE=0.445
n_estimators= 80, RMSE=0.445
n_estimators= 90, RMSE=0.445
n_estimators=100, RMSE=0.445
n_estimators=110, RMSE=0.444
n_estimators=120, RMSE=0.444
n_estimators=130, RMSE=0.444
n_estimators=140, RMSE=0.444
n_estimators=150, RMSE=0.443
n_estimators=160, RMSE=0.443
n_estimators=170, RMSE=0.443
n_estimators=180, RMSE=0.442
n_estimators=190, RMSE=0.443
n_estimators=200, RMSE=0.442

Best n_estimators: 180
Best RMSE: 0.442

RMSE stops improving after n_estimators=70


In [61]:
depth_results = {}

# For every candidate depth, score forests over the whole n_estimators grid
# and remember the mean validation RMSE of that depth.
for depth in (10, 15, 20, 25):
    print(f"\nTesting max_depth={depth}")
    print("-" * 40)

    rmse_scores = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train_matrix, y_train)

        y_val_pred = rf.predict(X_val_matrix)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_scores.append(rmse)

        print(f"  n_estimators={n:3d}, RMSE={rmse:.3f}")

    # Average over the grid so each depth is summarised by a single number.
    mean_rmse = np.mean(rmse_scores)
    depth_results[depth] = mean_rmse
    print(f"  Mean RMSE: {mean_rmse:.3f}")

# Summary table, ordered by depth.
separator = "=" * 40
print("\n" + separator)
print("Summary:")
print(separator)
for depth in sorted(depth_results):
    mean_rmse = depth_results[depth]
    print(f"max_depth={depth:2d}: Mean RMSE = {mean_rmse:.3f}")

# The winning depth is the one with the lowest mean RMSE.
best_depth, best_mean_rmse = min(depth_results.items(), key=lambda item: item[1])
print(f"\nBest max_depth: {best_depth}")
print(f"Best mean RMSE: {best_mean_rmse:.3f}")


Testing max_depth=10
----------------------------------------
  n_estimators= 10, RMSE=0.452
  n_estimators= 20, RMSE=0.448
  n_estimators= 30, RMSE=0.446
  n_estimators= 40, RMSE=0.443
  n_estimators= 50, RMSE=0.442
  n_estimators= 60, RMSE=0.442
  n_estimators= 70, RMSE=0.441
  n_estimators= 80, RMSE=0.442
  n_estimators= 90, RMSE=0.442
  n_estimators=100, RMSE=0.441
  n_estimators=110, RMSE=0.440
  n_estimators=120, RMSE=0.441
  n_estimators=130, RMSE=0.440
  n_estimators=140, RMSE=0.440
  n_estimators=150, RMSE=0.440
  n_estimators=160, RMSE=0.440
  n_estimators=170, RMSE=0.440
  n_estimators=180, RMSE=0.440
  n_estimators=190, RMSE=0.440
  n_estimators=200, RMSE=0.440
  Mean RMSE: 0.442

Testing max_depth=15
----------------------------------------
  n_estimators= 10, RMSE=0.459
  n_estimators= 20, RMSE=0.451
  n_estimators= 30, RMSE=0.450
  n_estimators= 40, RMSE=0.447
  n_estimators= 50, RMSE=0.446
  n_estimators= 60, RMSE=0.445
  n_estimators= 70, RMSE=0.444
  n_estimators= 80

In [62]:
# Refit a small forest (10 trees, max depth 20) to inspect feature importances.
rf_final = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_final.fit(X_train_matrix, y_train)

# Pair every vectorizer feature name with its importance score.
feature_names = dv.get_feature_names_out()
feature_importances = rf_final.feature_importances_
importance_dict = {name: score for name, score in zip(feature_names, feature_importances)}

# Keep only the features of interest; names the model does not know are skipped.
specified_features = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']
specified_importances = {}
for feat in specified_features:
    if feat in importance_dict:
        specified_importances[feat] = importance_dict[feat]

# Highest importance first.
sorted_features = sorted(specified_importances.items(), key=lambda pair: pair[1], reverse=True)

print("Feature Importances:")
print("-" * 40)
for feat, imp in sorted_features:
    print(f"{feat:20s}: {imp:.6f}")

top_feature, top_importance = sorted_features[0]
print(f"\nMost important feature: {top_feature}")
print(f"Importance: {top_importance:.6f}")

Feature Importances:
----------------------------------------
vehicle_weight      : 0.959185
horsepower          : 0.016133
acceleration        : 0.011565
engine_displacement : 0.003372

Most important feature: vehicle_weight
Importance: 0.959185


In [63]:
import xgboost as xgb

In [64]:
# Wrap the train/validation matrices together with their labels for XGBoost.
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dval = xgb.DMatrix(X_val_matrix, label=y_val)

# Both sets are passed as evaluation sets so their RMSE is logged during training.
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Booster settings for the first run (learning rate eta=0.3).
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# 100 boosting rounds, logging the evaluation metric every 10th round.
model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=10)

# Validation RMSE of the model after the last boosting round.
y_val_pred_xgb = model.predict(dval)
val_mse_xgb = mean_squared_error(y_val, y_val_pred_xgb)
val_rmse_xgb = np.sqrt(val_mse_xgb)
print(f"\nFinal Validation RMSE: {val_rmse_xgb:.3f}")

[0]	train-rmse:1.81393	val-rmse:1.85444


[10]	train-rmse:0.37182	val-rmse:0.43794
[20]	train-rmse:0.33561	val-rmse:0.43313
[30]	train-rmse:0.31474	val-rmse:0.43583
[40]	train-rmse:0.29648	val-rmse:0.43903
[50]	train-rmse:0.28020	val-rmse:0.44149
[60]	train-rmse:0.26387	val-rmse:0.44257
[70]	train-rmse:0.25334	val-rmse:0.44373
[80]	train-rmse:0.24186	val-rmse:0.44629
[90]	train-rmse:0.23080	val-rmse:0.44812
[99]	train-rmse:0.22071	val-rmse:0.44900

Final Validation RMSE: 0.449


In [65]:
# Second run: identical settings except for a smaller learning rate (eta=0.1),
# so its validation RMSE can be compared with the eta=0.3 run.
xgb_params = {
    'eta': 0.1, 'max_depth': 6, 'min_child_weight': 1,
    'objective': 'reg:squarederror', 'nthread': 8,
    'seed': 1, 'verbosity': 1,
}

# Same training budget and logging cadence as the first run.
model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=10)

# Validation RMSE of the model after the last boosting round.
y_val_pred_xgb = model.predict(dval)
val_mse_xgb = mean_squared_error(y_val, y_val_pred_xgb)
val_rmse_xgb = np.sqrt(val_mse_xgb)
print(f"\nFinal Validation RMSE: {val_rmse_xgb:.3f}")

[0]	train-rmse:2.28944	val-rmse:2.34561
[10]	train-rmse:0.91010	val-rmse:0.94065
[20]	train-rmse:0.49011	val-rmse:0.53109
[30]	train-rmse:0.38429	val-rmse:0.44318
[40]	train-rmse:0.35427	val-rmse:0.42844
[50]	train-rmse:0.33945	val-rmse:0.42633
[60]	train-rmse:0.32860	val-rmse:0.42621
[70]	train-rmse:0.32022	val-rmse:0.42681
[80]	train-rmse:0.31571	val-rmse:0.42735
[90]	train-rmse:0.31136	val-rmse:0.42805
[99]	train-rmse:0.30684	val-rmse:0.42860

Final Validation RMSE: 0.429
