In [1]:
import pandas as pd
import numpy as np

import seaborn as sn
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error

In [3]:
# URL of the raw car fuel-efficiency CSV; the next cell downloads it with wget.
data='https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [4]:
!wget $data

--2025-11-03 20:43:48--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-03 20:43:48 (21.7 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [5]:
# Load the locally downloaded CSV into the working DataFrame.
df=pd.read_csv('car_fuel_efficiency.csv')

In [6]:
# Names of every column whose dtype is not 'object'. As the displayed output
# shows, this includes the target column fuel_efficiency_mpg.
numerical = df.dtypes[df.dtypes != 'object'].index
numerical

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [7]:
# Names of every column stored with the 'object' dtype (the string-valued ones here).
categorical = df.dtypes[df.dtypes == 'object'].index
categorical

Index(['origin', 'fuel_type', 'drivetrain'], dtype='object')

In [8]:
# Replace missing values with 0 in all non-object columns at once.
df[numerical] = df[numerical].fillna(0)

In [9]:
# Replace missing values with the literal string 'NA' in all object columns at once.
df[categorical] = df[categorical].fillna('NA')

In [10]:
# 60/20/20 split: hold out 20% of the rows for test, then take 25% of the
# remaining 80% (i.e. 20% of the total) for validation. Both splits are seeded.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [11]:
# Discard the shuffled row labels left by the split so each part is indexed 0..n-1.
df_train, df_test, df_val = (
    part.reset_index(drop=True) for part in (df_train, df_test, df_val)
)

In [12]:
# Pull the target column out of each part as a NumPy array.
y_train, y_test, y_val = (
    part['fuel_efficiency_mpg'].values for part in (df_train, df_test, df_val)
)

In [13]:
# Remove the target from the feature frames so it cannot be used as an input.
df_train = df_train.drop(columns=['fuel_efficiency_mpg'])
df_test = df_test.drop(columns=['fuel_efficiency_mpg'])
df_val = df_val.drop(columns=['fuel_efficiency_mpg'])

In [14]:
# Count remaining missing values per column in the full frame (run after the fills above).
df.isna().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [17]:
# Vectorizer that turns record dicts into a feature matrix; sparse output is requested.
dv= DictVectorizer(sparse=True)

In [15]:
# One dict per training row ({column: value}), the input format DictVectorizer consumes.
train_dicts = df_train.to_dict(orient='records')

In [18]:
# Fit the vectorizer on the training records and encode them in the same step.
X_train = dv.fit_transform(train_dicts)

In [19]:
# Depth-1 regression tree (a single split), used to see which feature is split on first.
dt = DecisionTreeRegressor(max_depth=1)

In [20]:
# Display the (not yet fitted) estimator's repr.
dt

In [21]:
# Fit the depth-1 tree on the encoded training data.
dt.fit(X_train, y_train)

In [22]:
# One dict per validation row, mirroring how the training records were built.
val_dicts = df_val.to_dict(orient='records')

In [23]:
# Encode validation rows with the already-fitted vectorizer (transform only, no refit).
X_val = dv.transform(val_dicts)

In [24]:
# Validation predictions from the depth-1 tree; y_pred is reassigned later by the random forest.
y_pred = dt.predict(X_val)

In [25]:
# Output feature names of the fitted vectorizer, converted to a plain Python list.
feature_names_list = dv.get_feature_names_out().tolist()
feature_names_list

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [27]:
from sklearn.tree import export_text

In [28]:
# Render the fitted tree as text rules, labelling splits with the vectorizer's feature names.
tree_rules=export_text(dt, feature_names=feature_names_list)
print(tree_rules)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
# Baseline forest: 10 trees, seeded, all cores. fit() returns the estimator itself,
# so the trailing bare `rf` displays the same fitted object.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)
rf

In [31]:
# Validation predictions from the random forest (overwrites the decision-tree y_pred).
y_pred = rf.predict(X_val)

In [32]:
from sklearn.metrics import mean_squared_error
from numpy import sqrt

# RMSE on the validation set: square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Root Mean Square : ", rmse)

Root Mean Square :  0.4588479076612982


In [33]:
import numpy as np

In [39]:
# Candidate tree counts: 10, 20, ..., 200.
estimators = 10 * np.arange(1, 21)

In [40]:
def random_forest_varied_estimator(value):
    """Train a random forest with ``value`` trees and score it on the validation set.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    value
        Passed to ``RandomForestRegressor`` as ``n_estimators``.

    Returns
    -------
    The validation RMSE, rounded to 3 decimal places.
    """
    forest = RandomForestRegressor(n_estimators=value, random_state=1, n_jobs=-1)
    forest.fit(X_train, y_train)
    squared_error = mean_squared_error(y_val, forest.predict(X_val))
    return round(np.sqrt(squared_error), 3)

In [41]:
# Report the validation RMSE for each candidate number of trees.
for estimator in estimators:
    print(
        f"Value : {estimator} ",
        f"rmse : {random_forest_varied_estimator(estimator)}",
        sep="",
    )

Value : 10 rmse : 0.459
Value : 20 rmse : 0.453
Value : 30 rmse : 0.451
Value : 40 rmse : 0.448
Value : 50 rmse : 0.446
Value : 60 rmse : 0.445
Value : 70 rmse : 0.444
Value : 80 rmse : 0.445
Value : 90 rmse : 0.445
Value : 100 rmse : 0.445
Value : 110 rmse : 0.443
Value : 120 rmse : 0.444
Value : 130 rmse : 0.443
Value : 140 rmse : 0.443
Value : 150 rmse : 0.443
Value : 160 rmse : 0.443
Value : 170 rmse : 0.443
Value : 180 rmse : 0.442
Value : 190 rmse : 0.442
Value : 200 rmse : 0.442


In [42]:
# Grid for the depth sweep: max_depth in {10, 15, 20, 25}, tree counts 10, 20, ..., 200.
depths = list(range(10, 30, 5))
estimators = 10 * np.arange(1, 21)

In [44]:
def random_forest_varied_depth_estimator(depth, estimator):
    """Train a random forest for one (max_depth, n_estimators) pair and score it.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    depth
        Passed to ``RandomForestRegressor`` as ``max_depth``.
    estimator
        Passed to ``RandomForestRegressor`` as ``n_estimators``.

    Returns
    -------
    The unrounded validation RMSE.
    """
    forest = RandomForestRegressor(
        n_estimators=estimator,
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    squared_error = mean_squared_error(y_val, forest.predict(X_val))
    return np.sqrt(squared_error)

In [45]:
# For each max_depth, average the validation RMSE over every candidate tree count.
rmse_summary = {}

for depth in depths:
    rmses = [
        random_forest_varied_depth_estimator(depth, estimator)
        for estimator in estimators
    ]
    rmse_summary[depth] = np.mean(rmses)

In [48]:
# The depth with the lowest mean RMSE; on ties the first depth inserted wins.
best_max_depth = min(rmse_summary.items(), key=lambda item: item[1])[0]
print(f"Best Max Depth : {best_max_depth} ")

Best Max Depth : 10 


In [49]:
# Model used for feature importances: 10 trees with depth capped at 20.
# NOTE(review): max_depth is the literal 20, not best_max_depth from the sweep — confirm intended.
# fit() returns the estimator itself, so the trailing bare `rf` displays the fitted object.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)
rf

In [52]:
# Feature names of the fitted vectorizer as a plain Python list. The call
# parentheses on tolist() matter: without them the name is bound to the method
# object instead of the list of names.
feature_names = dv.get_feature_names_out().tolist()

In [56]:
# Position of the largest feature importance in the fitted forest.
importances = rf.feature_importances_
most_important_index = importances.argmax()
most_important_index

13

In [58]:
# Rebuild the list of vectorizer feature names so the importance index can be looked up by position.
feature_names_list = dv.get_feature_names_out().tolist()
feature_names_list

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [59]:
# Map the argmax position back to its feature name.
most_important_feature = feature_names_list[most_important_index]
most_important_feature

'vehicle_weight'