In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

In [26]:
# Fetch the car fuel-efficiency dataset from GitHub and keep a CSV copy in the
# working directory (written without the index column).
url = (
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/'
    'car_fuel_efficiency.csv'
)
df = pd.read_csv(url)
df.to_csv(path_or_buf='car_fuel_efficiency.csv', index=False)

In [27]:
# Reload the dataset from the local CSV copy written by the download cell.
df = pd.read_csv('car_fuel_efficiency.csv')

In [28]:
# Preview the first rows to check the available columns and their values.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [30]:
# Count missing values per column to see which features need imputing.
print(df.isnull().sum())

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [31]:
# Impute every missing value with 0 (applied to all columns at once).
df = df.fillna(0)

In [32]:
# Target vector is the `fuel_efficiency_mpg` column; every other column is a
# feature.
y = df['fuel_efficiency_mpg'].values
X = df.drop(columns=['fuel_efficiency_mpg'])

In [33]:
# Step 1: keep 60% of the rows for training and hold out the remaining 40%.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y,
    random_state=1,
    test_size=0.4,
)

# Step 2: halve the hold-out into validation and test sets (20% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    random_state=1,
    test_size=0.5,
)

In [34]:
# Turn each feature frame into a list of per-row dicts, the input format
# expected by DictVectorizer.
train_dicts, val_dicts, test_dicts = (
    frame.to_dict(orient='records')
    for frame in (X_train_full, X_val, X_test)
)

In [35]:
# Learn the feature mapping on the training records only, then apply the same
# fitted mapping to the validation and test records.
dv = DictVectorizer(sparse=True)
X_train_sparse = dv.fit_transform(train_dicts)
X_val_sparse, X_test_sparse = (
    dv.transform(records) for records in (val_dicts, test_dicts)
)

In [36]:
# Depth-1 regression tree: a single split on the training data.
dt = DecisionTreeRegressor(random_state=1, max_depth=1)
dt.fit(X=X_train_sparse, y=y_train_full)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [37]:
# Identify the feature used by the root split of the depth-1 tree.
feature_names = dv.get_feature_names_out()
feature_idx = dt.tree_.feature[0]  # node 0 is the root of the fitted tree
# scikit-learn marks leaf nodes with a negative sentinel in `tree_.feature`.
# Guard against it so that a tree which never split cannot silently pick a
# feature by indexing `feature_names` from the end.
assert feature_idx >= 0, "root node is a leaf: the tree made no split"
split_feature = feature_names[feature_idx]

In [39]:
# Question 1: feature used for the split in the depth-1 decision tree
split_feature

'vehicle_weight'

In [None]:
# Question 2: validation RMSE of a random forest with n_estimators=10

In [42]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
# Random forest baseline with 10 trees, scored by RMSE on the validation set.
# The model is bound to `rf` (not `dt`) so the fitted decision tree from the
# previous question is not overwritten: the cell that reads `dt.tree_` keeps
# working when it is re-run after this one.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_sparse, y_train_full)
y_pred = rf.predict(X_val_sparse)
rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
rmse

np.float64(0.461004655378348)

In [None]:
# Question 3: validation RMSE as n_estimators grows from 10 to 200 (step 10)

In [48]:
from sklearn.metrics import mean_squared_error
import math

In [49]:
# Forest sizes to evaluate: 10, 20, 30, ..., 200.
n_estimators_values = [10 * step for step in range(1, 21)]
# Collects one record per evaluated forest size.
results = []

In [50]:
# Train one forest per n_estimators value and report its validation RMSE
# together with the change relative to the previous (smaller) forest.
results = []  # rebuilt here so re-running this cell does not append duplicates
previous_rmse = None

for n_est in n_estimators_values:
    rf = RandomForestRegressor(
        n_estimators=n_est,
        random_state=1,
        n_jobs=-1
    )

    rf.fit(X_train_sparse, y_train_full)
    y_pred_val = rf.predict(X_val_sparse)
    mse_val = mean_squared_error(y_val, y_pred_val)
    rmse_val = math.sqrt(mse_val)

    # Keep both the raw score and the 3-decimal rounding.
    results.append({
        'n_estimators': n_est,
        'rmse': rmse_val,
        'rmse_rounded': round(rmse_val, 3)
    })

    # The first forest has nothing to compare against.
    if previous_rmse is not None:
        change = rmse_val - previous_rmse
        change_str = f"({change:+.4f})"
    else:
        change_str = "(baseline)"

    print(f"{n_est:<15} {rmse_val:<20.4f} {change_str:<20}")
    previous_rmse = rmse_val

10              0.4610               (baseline)          
20              0.4462               (-0.0148)           
30              0.4400               (-0.0062)           
40              0.4389               (-0.0011)           
50              0.4376               (-0.0014)           
60              0.4361               (-0.0014)           
70              0.4366               (+0.0004)           
80              0.4366               (+0.0001)           
90              0.4360               (-0.0006)           
100             0.4359               (-0.0001)           
110             0.4356               (-0.0004)           
120             0.4362               (+0.0006)           
130             0.4357               (-0.0005)           
140             0.4359               (+0.0002)           
150             0.4358               (-0.0001)           
160             0.4358               (+0.0000)           
170             0.4357               (-0.0001)           
180           

In [52]:
# Question 4: mean validation RMSE for each max_depth in [10, 15, 20, 25]

In [53]:
# Grid over tree depth x forest size: for every depth, collect the validation
# RMSE of each forest size and summarise the scores (mean / std / min / max).
max_depth_values = [10, 15, 20, 25]
n_estimators_values = list(range(10, 210, 10))  # 10, 20, 30, ..., 200

depth_results = {}
banner = '=' * 80

for max_depth in max_depth_values:
    print("\n" + banner)
    print(f"Test max_depth = {max_depth}")
    print(banner)
    print(f"{'n_estimators':<15} {'RMSE (val)':<20}")
    print("-" * 80)

    rmse_scores = []
    for n_est in n_estimators_values:
        rf = RandomForestRegressor(
            n_estimators=n_est,
            max_depth=max_depth,
            n_jobs=-1,
            random_state=1,
        )
        rf.fit(X_train_sparse, y_train_full)

        y_pred_val = rf.predict(X_val_sparse)
        mse_val = mean_squared_error(y_val, y_pred_val)
        rmse_val = math.sqrt(mse_val)
        rmse_scores.append(rmse_val)

        # Echo only every fifth forest size (10, 60, 110, 160) to keep the
        # log short; all scores are still stored in rmse_scores.
        if n_est % 50 == 10:
            print(f"{n_est:<15} {rmse_val:<20.4f}")

    summary = {
        'mean_rmse': np.mean(rmse_scores),
        'std_rmse': np.std(rmse_scores),
        'min_rmse': np.min(rmse_scores),
        'max_rmse': np.max(rmse_scores),
        'rmse_scores': rmse_scores,
    }
    depth_results[max_depth] = summary

    print(f"\nmax_depth = {max_depth}:")
    print(f"  Mean RMSE (mean):   {summary['mean_rmse']:.4f}")
    print(f"  std:      {summary['std_rmse']:.4f}")
    print(f"  Min RMSE:             {summary['min_rmse']:.4f}")
    print(f"  Max RMSE:             {summary['max_rmse']:.4f}")


Test max_depth = 10
n_estimators    RMSE (val)          
--------------------------------------------------------------------------------
10              0.4510              
60              0.4351              
110             0.4341              
160             0.4345              

max_depth = 10:
  Mean RMSE (mean):   0.4361
  std:      0.0039
  Min RMSE:             0.4341
  Max RMSE:             0.4510

Test max_depth = 15
n_estimators    RMSE (val)          
--------------------------------------------------------------------------------
10              0.4603              
60              0.4360              
110             0.4356              
160             0.4360              

max_depth = 15:
  Mean RMSE (mean):   0.4383
  std:      0.0057
  Min RMSE:             0.4355
  Max RMSE:             0.4603

Test max_depth = 20
n_estimators    RMSE (val)          
--------------------------------------------------------------------------------
10              0.4602           

In [54]:
# Print one summary row per depth and remember the depth with the lowest mean
# validation RMSE (the first one wins on ties because of the strict `<`).
best_depth, best_mean_rmse = None, float('inf')

for depth in max_depth_values:
    stats = depth_results[depth]
    print(
        f"{depth:<15} {stats['mean_rmse']:<20.4f} "
        f"{stats['std_rmse']:<20.4f} {stats['min_rmse']:<20.4f}"
    )

    if stats['mean_rmse'] < best_mean_rmse:
        best_depth, best_mean_rmse = depth, stats['mean_rmse']

10              0.4361               0.0039               0.4341              
15              0.4383               0.0057               0.4355              
20              0.4378               0.0058               0.4350              
25              0.4382               0.0060               0.4354              


In [55]:
# Depth with the lowest mean validation RMSE across all forest sizes.
best_depth

10

In [57]:
# Question 5: feature importances of a random forest (n_estimators=10, max_depth=20)

In [58]:
# Fit the forest whose feature importances are inspected below.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)

print("\nTraining Random Forest model...")
rf.fit(X_train_sparse, y_train_full)
print("Model trained!")

# Calculate performance on validation set
y_pred_val = rf.predict(X_val_sparse)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = math.sqrt(mse_val)
print(f"\nRMSE on validation set: {rmse_val:.4f}")

# Column names are taken straight from the vectorizer so this cell does not
# depend on a `feature_names` variable left behind by an earlier cell.
feature_names = dv.get_feature_names_out()
feature_importances = rf.feature_importances_
# name -> importance lookup, used for the answer options further down
importance_by_feature = dict(zip(feature_names, feature_importances))

print("\n" + "="*80)
print("FEATURE IMPORTANCES")
print("="*80)

# Create DataFrame with importances, most important first
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print("\nAll features sorted by importance:")
print("-"*80)
print(f"{'Rank':<5} {'Feature':<30} {'Importance':<20} {'Percentage':<15}")
print("-"*80)

for idx, (_, row) in enumerate(importance_df.iterrows(), 1):
    # Attach the percent sign to the number *before* padding the column;
    # padding first would print e.g. "95.99          %".
    percent_text = f"{row['importance'] * 100:.2f}%"
    print(f"{idx:<5} {row['feature']:<30} {row['importance']:<20.6f} {percent_text:<15}")

# Find importance of specific features from the answer options
options = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']

print("\n" + "="*80)
print("IMPORTANCE OF ANSWER OPTIONS")
print("="*80)

importance_of_options = {}

for option in options:
    if option in importance_by_feature:
        importance = importance_by_feature[option]
        importance_of_options[option] = importance
        print(f"{option:<30} Importance: {importance:.6f}")
    else:
        print(f"{option:<30} NOT FOUND IN FEATURES")
        # Try to find similar features (case-insensitive substring match)
        similar = [f for f in feature_names if option.lower() in f.lower()]
        if similar:
            print(f"  Possible alternatives: {similar}")

# Find the most important feature among the options
print("\n" + "="*80)
print("ANSWER TO QUESTION 5:")
print("="*80)

if importance_of_options:
    most_important_option = max(importance_of_options.items(), key=lambda x: x[1])
    most_important_feature = most_important_option[0]
    most_important_value = most_important_option[1]

    print(f"Most important feature: {most_important_feature}")
    print(f"Importance: {most_important_value:.6f}")
    print("="*80)

    # Ranking of options
    print(f"\nRanking of options (from most to least important):")
    sorted_options = sorted(importance_of_options.items(), key=lambda x: x[1], reverse=True)
    for rank, (feature, importance) in enumerate(sorted_options, 1):
        print(f"  {rank}. {feature:<30} (importance: {importance:.6f})")
else:
    print("Options not found in available features!")

# Top 5 most important features (all)
print("\n" + "="*80)
print("TOP 5 MOST IMPORTANT FEATURES (ALL)")
print("="*80)

top_5 = importance_df.head(5)
for idx, (_, row) in enumerate(top_5.iterrows(), 1):
    percent = row['importance'] * 100
    print(f"{idx}. {row['feature']:<30} (importance: {row['importance']:.6f}, {percent:.2f}%)")


Training Random Forest model...
Model trained!

RMSE on validation set: 0.4602

FEATURE IMPORTANCES

All features sorted by importance:
--------------------------------------------------------------------------------
Rank  Feature                        Importance           Percentage     
--------------------------------------------------------------------------------
1     vehicle_weight                 0.959859             95.99          %
2     horsepower                     0.015968             1.60           %
3     acceleration                   0.011432             1.14           %
4     engine_displacement            0.003182             0.32           %
5     model_year                     0.003069             0.31           %
6     num_cylinders                  0.002301             0.23           %
7     num_doors                      0.001563             0.16           %
8     origin=USA                     0.000490             0.05           %
9     origin=Asia          

In [59]:
# Question 6: XGBoost validation RMSE for eta=0.3 versus eta=0.1

In [64]:
import xgboost as xgb

In [66]:
# Number of boosting rounds used for every eta. Kept in one place so training,
# the stored metadata and the printed summaries cannot drift apart.
NUM_BOOST_ROUNDS = 100

# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train_sparse, label=y_train_full)
dval = xgb.DMatrix(X_val_sparse, label=y_val)

# Evaluation sets reported during training; identical for every eta, so the
# list is built once instead of on every loop iteration.
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Dictionary to store results, keyed by eta
eta_results = {}

# Test different eta values
eta_values = [0.3, 0.1]

for eta in eta_values:
    print(f"\n{'='*80}")
    print(f"Testing eta = {eta}")
    print(f"{'='*80}")

    # Define XGBoost parameters (only eta varies between runs)
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'reg:squarederror',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }

    # Train the model for the configured number of rounds
    print(f"\nTraining model with eta={eta} for {NUM_BOOST_ROUNDS} rounds...")
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=NUM_BOOST_ROUNDS,
        evals=watchlist,
        verbose_eval=20  # Print evaluation every 20 rounds
    )

    # Get predictions on validation set
    y_pred_val = model.predict(dval)

    # Calculate RMSE on validation set
    mse_val = mean_squared_error(y_val, y_pred_val)
    rmse_val = math.sqrt(mse_val)

    # Store results
    eta_results[eta] = {
        'model': model,
        'y_pred_val': y_pred_val,
        'rmse_val': rmse_val,
        'num_rounds': NUM_BOOST_ROUNDS
    }

    print(f"\nResults for eta = {eta}:")
    print(f"  RMSE on validation set: {rmse_val:.4f}")
    print(f"  Total boosting rounds: {NUM_BOOST_ROUNDS}")

# Compare results
print("\n" + "="*80)
print("COMPARISON OF ETA VALUES")
print("="*80)

print(f"\n{'ETA':<10} {'Validation RMSE':<20}")
print("-"*80)

best_eta = None
best_rmse = float('inf')

for eta in sorted(eta_results.keys()):
    rmse = eta_results[eta]['rmse_val']
    print(f"{eta:<10} {rmse:<20.4f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_eta = eta

# Determine the answer
print("\n" + "="*80)
print("ANSWER TO QUESTION 6:")
print("="*80)

rmse_03 = eta_results[0.3]['rmse_val']
rmse_01 = eta_results[0.1]['rmse_val']
rmse_diff = abs(rmse_03 - rmse_01)

print(f"\nRMSE with eta=0.3: {rmse_03:.6f}")
print(f"RMSE with eta=0.1: {rmse_01:.6f}")
print(f"Difference: {rmse_diff:.6f}")

# Define tolerance for "equal values": scores closer than this count as a tie
tolerance = 0.0001

if rmse_diff < tolerance:
    answer = "Both give equal value"
elif rmse_03 < rmse_01:
    answer = "0.3"
else:
    answer = "0.1"

print(f"\nBest eta: {best_eta}")
print(f"Best RMSE: {best_rmse:.6f}")
print(f"\nAnswer: {answer}")
print("="*80)

# Feature importance from the best model
print("\n" + "="*80)
print(f"FEATURE IMPORTANCE FROM BEST MODEL (eta={best_eta})")
print("="*80)

best_model = eta_results[best_eta]['model']
feature_names = dv.get_feature_names_out()

# The DMatrix objects above were built without feature names, so the booster
# is expected to report importances under positional keys 'f0', 'f1', ...
# Translate those keys through an explicit lookup instead of stripping every
# 'f' character and parsing an int: a key that is not in the lookup (for
# example a real column name) is shown unchanged rather than raising.
default_key_to_name = {f"f{i}": name for i, name in enumerate(feature_names)}

# Get feature importances (best effort: a failure here must not hide the
# comparison printed above, so it is reported instead of raised)
try:
    importances = best_model.get_score(importance_type='weight')

    # Convert to DataFrame for better visualization
    importance_df = pd.DataFrame(
        list(importances.items()),
        columns=['feature_index', 'importance']
    ).sort_values('importance', ascending=False)

    # Map booster keys to vectorizer feature names
    importance_df['feature_name'] = importance_df['feature_index'].map(
        lambda key: default_key_to_name.get(key, key)
    )

    print("\nTop 10 most important features:")
    print("-"*80)
    print(f"{'Rank':<5} {'Feature':<30} {'Importance':<15}")
    print("-"*80)

    for idx, (_, row) in enumerate(importance_df.head(10).iterrows(), 1):
        print(f"{idx:<5} {row['feature_name']:<30} {row['importance']:<15.2f}")
except Exception as e:
    print(f"Could not extract feature importances: {e}")

# Additional diagnostics
print("\n" + "="*80)
print("MODEL TRAINING SUMMARY")
print("="*80)

for eta in eta_values:
    model = eta_results[eta]['model']
    rmse = eta_results[eta]['rmse_val']
    num_rounds = eta_results[eta]['num_rounds']

    print(f"\nETA = {eta}:")
    print(f"  Final Validation RMSE: {rmse:.6f}")
    print(f"  Boosting rounds: {num_rounds}")


Testing eta = 0.3

Training model with eta=0.3 for 100 rounds...
[0]	train-rmse:1.83282	val-rmse:1.82567


[20]	train-rmse:0.33724	val-rmse:0.42509
[40]	train-rmse:0.29886	val-rmse:0.43086
[60]	train-rmse:0.26525	val-rmse:0.43583
[80]	train-rmse:0.23379	val-rmse:0.43902
[99]	train-rmse:0.20896	val-rmse:0.44340

Results for eta = 0.3:
  RMSE on validation set: 0.4434
  Total boosting rounds: 100

Testing eta = 0.1

Training model with eta=0.1 for 100 rounds...
[0]	train-rmse:2.31334	val-rmse:2.30592
[20]	train-rmse:0.49316	val-rmse:0.52039
[40]	train-rmse:0.35546	val-rmse:0.41889
[60]	train-rmse:0.33254	val-rmse:0.41610
[80]	train-rmse:0.32012	val-rmse:0.41654
[99]	train-rmse:0.31183	val-rmse:0.41674

Results for eta = 0.1:
  RMSE on validation set: 0.4167
  Total boosting rounds: 100

COMPARISON OF ETA VALUES

ETA        Validation RMSE     
--------------------------------------------------------------------------------
0.1        0.4167              
0.3        0.4434              

ANSWER TO QUESTION 6:

RMSE with eta=0.3: 0.443405
RMSE with eta=0.1: 0.416743
Difference: 0.026662

Best e