In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [42]:
car_sales = pd.read_csv("./data/car-sales.csv").dropna(subset=["Price"])

x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [44]:
imputer = ColumnTransformer(
    [
        (
            "imputer_const",
            SimpleImputer(strategy="constant", fill_value="N/A"),
            ["Make", "Colour"],
        ),
        (
            "imputer_mode",
            SimpleImputer(strategy="most_frequent"),
            ["Doors"],
        ),
        (
            "imputer_mean",
            SimpleImputer(strategy="mean"),
            ["Odometer (KM)"],
        ),
    ],
    remainder="passthrough",
)

encoder = ColumnTransformer(
    [
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["make", "color", "doors"],
        ),
    ],
    remainder="passthrough",
)


def preprocess_x(x):
    x_imputed = pd.DataFrame(
        imputer.fit_transform(x),
        columns=["make", "color", "doors", "odometer"],
    )

    x_encoded = pd.DataFrame(encoder.fit_transform(x_imputed))

    return x_encoded


x_train = preprocess_x(x_train)
x_test = preprocess_x(x_test)

In [45]:
from sklearn.ensemble import RandomForestRegressor


model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.2986212058672991