## Preparing the dataset

In [1]:
# The goal of this homework is to create a regression model for predicting the car fuel efficiency (column 'fuel_efficiency_mpg').

# Preparing the dataset
# Preparation:

# Fill missing values with zeros.
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

import numpy as np
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("car_fuel_efficiency.csv")
# Show the last rows to confirm the file parsed and to see the column layout.
df.tail()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.52739,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551
9703,270,3.0,140.0,2908.043477,14.7,2005,Asia,Diesel,All-wheel drive,-1.0,14.884467


In [17]:
# NB - num_doors also contains negative values, which make no sense for a door count - the next cell replaces them with their absolute value.

# df.describe()

In [18]:
# Replace negative door counts with their magnitude (assumes the minus sign is a
# data-entry artefact - TODO confirm). NaN entries stay NaN at this point.
df["num_doors"] = df["num_doors"].abs()
# df.describe()  # re-check the column range after the fix

In [4]:
# Count missing values per column before imputing, to see which features are affected.
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
# Replace every NaN with 0, as the assignment requires. Per the count above only
# numeric columns contain NaN, so no categorical value is turned into 0.
df = df.fillna(0)
# Confirm that no missing values remain.
df.isna().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [15]:
# df.describe()
# check - looks good

In [7]:
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.

from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# Step 2: take 25% of the remaining 80% as validation (0.25 * 0.8 = 0.2 of the
# whole dataset), which leaves 60% for training.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [8]:
# Sanity check: sizes of the full-train and test parts after the first split.
len(df_full_train), len(df_test) # (7763, 1941)

(7763, 1941)

In [9]:
# Sanity check: sizes of the train and validation parts after the second split.
len(df_train), len(df_val)  # (5822, 1941)

(5822, 1941)

In [10]:
# The splits still carry the shuffled row labels of the original frame; renumber
# each one from 0 and discard the old labels.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [11]:
# Extract the label (target variable) fuel_efficiency_mpg from each split as an array.
y_train, y_val, y_test = (
    split["fuel_efficiency_mpg"].values for split in (df_train, df_val, df_test)
)
# Expected lengths: (5822, 1941, 1941)
len(y_train), len(y_val), len(y_test)

(5822, 1941, 1941)

In [12]:
# Remove the label fuel_efficiency_mpg from every feature frame (in place) so the
# model cannot learn from it accidentally.
# NOTE: `del` raises KeyError if this cell is run a second time, because the
# column is already gone by then.

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']


In [16]:
# check all good
# df_train.head()

In [21]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

# The frame mixes numeric and categorical (string) columns; DictVectorizer
# one-hot encodes the categorical ones:
from sklearn.feature_extraction import DictVectorizer

# NOTE(review): roc_auc_score is a classification metric and is not used by the
# regression cells visible here; export_text is imported again in the cell that
# actually calls it. Consider moving all imports to a single cell at the top.
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

# One dict per row ({column: value}) - the input format DictVectorizer consumes.
train_dicts = df_train.to_dict(orient='records')
# train_dicts[:5] # quick check - all looks good


In [33]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
# With sparse=True the result is a SciPy sparse matrix rather than a dense array.
dv = DictVectorizer(sparse=True)
# Fit on the training rows only, so the feature columns are defined by the
# training data; the same fitted `dv` must be reused for other splits.
X_train = dv.fit_transform(train_dicts)
# used DictVectorizer for 1-hot encoding of our training dataset

In [34]:
# CHECK 1: list the feature columns after one-hot encoding - numeric columns keep
# their name, each categorical value becomes its own "column=value" feature.
dv.get_feature_names_out()
# The older get_feature_names() is deprecated; get_feature_names_out() replaces it.

array(['acceleration', 'drivetrain=All-wheel drive',
       'drivetrain=Front-wheel drive', 'engine_displacement',
       'fuel_type=Diesel', 'fuel_type=Gasoline', 'horsepower',
       'model_year', 'num_cylinders', 'num_doors', 'origin=Asia',
       'origin=Europe', 'origin=USA', 'vehicle_weight'], dtype=object)

In [35]:
# First training row in sparse form; only its non-zero entries are stored.
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (1, 14)>

In [37]:
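# # CHECK 2 - disabled: with sparse=True, X_train[0] is a 1 x n_features sparse matrix,
# # so zip() cannot pair feature names with scalar values. Densify the row first:
# import numpy as np
# for name, value in zip(dv.get_feature_names_out(), X_train[0].toarray()[0]):
#     print(name, value)
#     # prints our first row nicely using python zip() function - manual check...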

## Question 1

In [39]:
# Question 1
# Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

# Train a model with max_depth=1.
# Which feature is used for splitting the data?

# 'vehicle_weight' === vehicle_weight <= 3022.11
# 'model_year'
# 'origin'
# 'fuel_type'

# we are building a regressor model - to predict fuel efficiency:
from sklearn.tree import DecisionTreeRegressor


In [40]:

# Fit a decision stump (max_depth=1): a single split, so the tree exposes the one
# feature it considers most useful for predicting fuel efficiency.
# NOTE(review): no random_state is set; scikit-learn documents that ties between
# equally good splits are broken randomly - consider fixing a seed.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dt

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [41]:
# Render the fitted tree as indented text: the root condition names the feature
# used for the split, and each leaf shows its predicted value.
from sklearn.tree import export_text

print(export_text(dt, feature_names=dv.get_feature_names_out().tolist()))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



## Question 2

In [45]:
# Question 2
# Train a random forest regressor with these parameters:

# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?

# 0.045
# 0.45 == RMSE: 0.4618
# 4.5 
# 45.0

from sklearn.ensemble import RandomForestRegressor

# Random forest with the parameters the question prescribes: 10 trees, seed 1;
# n_jobs=-1 is optional and only there to make training faster.
rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=10).fit(X_train, y_train)
rf


0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [49]:
# Prepare X_val: encode the validation rows with the vectorizer that was fitted
# on the training data (transform only, so the columns match X_train).

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_pred = rf.predict(X_val)

# What's the RMSE of random forest regressor model on the validation data?
from sklearn.metrics import mean_squared_error

# RMSE = sqrt(MSE). The value is stored as `rmse_rf` rather than `rmse`: the next
# cell defines a helper function named `rmse`, and binding a float to the same
# name made `rmse(...)` uncallable whenever this cell was re-run afterwards.
rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {rmse_rf:.4f}")


RMSE: 0.4618


In [52]:
# Check myself another way
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_pred - y) ** 2))``.
    """
    # Coerce both inputs to float arrays so plain lists/tuples work as well as
    # NumPy arrays, and so integer inputs cannot overflow when squared.
    residuals = np.asarray(y_pred, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

# Cross-check: the hand-rolled RMSE, rounded to 4 decimals, should match the
# scikit-learn based value printed above.
score = round(rmse(y_val, y_pred), 4)
# Convert to a plain Python float for display.
float(score) # 0.4618 - same as before...

0.4618