In [23]:
import pandas as pd

# Read the car listings CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./resources/datasets/cars_2010_2020.csv')
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Size (L),Fuel Type,Price (USD)
0,Volkswagen,Jetta,2010,4.2,Petrol,54073.09
1,Honda,Pilot,2017,4.2,Hybrid,44924.91
2,Nissan,Murano,2011,4.2,Hybrid,76963.44
3,Toyota,RAV4,2010,2.4,Petrol,30871.25
4,Nissan,Altima,2010,3.6,Petrol,72037.65


In [24]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Make             10000 non-null  object 
 1   Model            10000 non-null  object 
 2   Year             10000 non-null  int64  
 3   Engine Size (L)  10000 non-null  float64
 4   Fuel Type        10000 non-null  object 
 5   Price (USD)      10000 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 468.9+ KB


In [25]:
# Drop the unit suffixes from the column names so later cells can refer to
# "Engine Size" and "Price" directly.
# The result is rebound instead of using inplace=True: the frame produced by
# the loading cell is not mutated behind the reader's back, and the cell stays
# safe to re-run (labels that are already renamed are simply not matched).
df = df.rename(columns={"Engine Size (L)": "Engine Size", "Price (USD)": "Price"})
df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Fuel Type,Price
0,Volkswagen,Jetta,2010,4.2,Petrol,54073.09
1,Honda,Pilot,2017,4.2,Hybrid,44924.91
2,Nissan,Murano,2011,4.2,Hybrid,76963.44
3,Toyota,RAV4,2010,2.4,Petrol,30871.25
4,Nissan,Altima,2010,3.6,Petrol,72037.65


In [26]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the regression target ("Price").
X = df.drop(columns=['Price'])
y = df['Price']

# Hold out a validation set using train_test_split's default split proportions;
# random_state pins the shuffle so the split is reproducible across runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, random_state=0)

# Classify columns by dtype using the TRAINING frame only. The original read the
# categorical dtypes from the validation frame, which was inconsistent with
# num_cols and lets held-out data drive feature selection.
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == "object"]
num_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

# Numeric columns first, then categorical; .copy() detaches the working frames
# from the full split frames.
merge_cols = num_cols + cat_cols
X_train = X_train_full[merge_cols].copy()
X_valid = X_valid_full[merge_cols].copy()

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: fill missing values with a constant.
# NOTE(review): with no fill_value given, scikit-learn documents the constant as
# 0 for numeric data - confirm that 0 is an acceptable stand-in for these columns.
num_trans = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit encode as
# an all-zero row instead of raising at transform/predict time; this matters
# because the validation split or new data may contain makes/models absent from
# the training split. Encoding of known categories is unchanged.
cat_trans = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own transformer.
preprocessing = ColumnTransformer(transformers=[
    ('numeric_type', num_trans, num_cols),
    ('categorical_type', cat_trans, cat_cols)
])

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with a fixed seed and at most 715 leaf nodes per tree.
regressor_model = RandomForestRegressor(
    max_leaf_nodes=715,
    random_state=0,
)

# Chain preprocessing and the regressor so both are fitted and applied together.
model = Pipeline([
    ('preprocessing', preprocessing),
    ('model', regressor_model),
])

# Fit on the training split; as the last expression, the fitted pipeline is displayed.
model.fit(X_train, y_train)

In [29]:
from sklearn.metrics import mean_absolute_error

# Predict prices for the held-out rows.
pred = model.predict(X_valid)

# Mean absolute error between the true and predicted validation prices.
mean_absolute_error(y_true=y_valid, y_pred=pred)

16408.03434840532

In [30]:
# One hand-written listing, expressed as a single record, using the same feature
# column names the pipeline was trained on.
# NOTE(review): Year 2022 lies outside the 2010-2020 range suggested by the
# dataset file name - confirm the model is meant to extrapolate here.
data_real = pd.DataFrame([
    {
        "Make": 'Hyundai',
        "Model": "CR-V",
        "Year": 2022,
        "Engine Size": 4.1,
        "Fuel Type": "Hybrid",
    }
])

# Price estimate for that single row.
model.predict(data_real)


array([45008.68852505])