In [177]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [178]:
df = pd.read_csv("./data/car-sales.csv")
df.rename(
    columns={
        "Make": "make",
        "Colour": "color",
        "Odometer (KM)": "odometer",
        "Doors": "doors",
        "Price": "price",
    },
    inplace=True,
)
df.dropna(subset=["price"], inplace=True)

In [179]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [180]:
imputer = ColumnTransformer(
    [
        (
            "imputer_na",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
            ["make", "color"],
        ),
        (
            "imputer_mode",
            SimpleImputer(strategy="most_frequent"),
            ["doors"],
        ),
        (
            "imputer_mean",
            SimpleImputer(strategy="mean"),
            ["odometer"],
        ),
    ],
    remainder="passthrough",
)

encoder = ColumnTransformer(
    [
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["make", "color", "doors"],
        ),
    ],
    remainder="passthrough",
)

In [181]:
X_train_imputed = imputer.fit_transform(X_train)
X_train = pd.DataFrame(X_train_imputed, columns=["make", "color", "doors", "odometer"])

X_train_encoded = encoder.fit_transform(X_train)
X_train_columns = encoder.get_feature_names_out(X_train.columns)
X_train = pd.DataFrame(X_train_encoded, columns=X_train_columns)

X_test_imputed = imputer.fit_transform(X_test)
X_test = pd.DataFrame(X_test_imputed, columns=["make", "color", "doors", "odometer"])

X_test_encoded = encoder.fit_transform(X_test)
X_test_columns = encoder.get_feature_names_out(X_test.columns)
X_test = pd.DataFrame(X_test_encoded, columns=X_test_columns)

In [182]:
model_forest = RandomForestRegressor()
model_forest.fit(X_train, y_train)
model_forest.score(X_test, y_test)

0.22040513727719058

In [183]:
model_lasso = Lasso()
model_lasso.fit(X_train, y_train)
model_lasso.score(X_test, y_test)

0.25731849440969223

In [184]:
model_elastic = ElasticNet()
model_elastic.fit(X_train, y_train)
model_elastic.score(X_test, y_test)

0.1611186436578529