In [162]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

In [163]:
df = pd.read_csv("./data/car-sales.csv")
df.rename(
    columns={
        "Make": "make",
        "Colour": "color",
        "Odometer (KM)": "odometer",
        "Doors": "doors",
        "Price": "price",
    },
    inplace=True,
)
df.dropna(subset=["make"], inplace=True)

In [164]:
X = df.drop("make", axis=1)
y = df["make"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [165]:
imputer_X = ColumnTransformer(
    [
        (
            "imputer_X_na",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
            ["color"],
        ),
        (
            "imputer_X_mode",
            SimpleImputer(strategy="most_frequent"),
            ["doors"],
        ),
        (
            "imputer_X_mean",
            SimpleImputer(strategy="mean"),
            ["odometer", "price"],
        ),
    ],
    remainder="passthrough",
)

encoder_X = ColumnTransformer(
    [
        (
            "encoder_X",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["color", "doors"],
        ),
    ],
    remainder="passthrough",
)

In [166]:
X_train_imputed = imputer_X.fit_transform(X_train)
X_train = pd.DataFrame(X_train_imputed, columns=["color", "doors", "odometer", "price"])

X_train_encoded = encoder_X.fit_transform(X_train)
X_train_columns = encoder_X.get_feature_names_out(X_train.columns)
X_train = pd.DataFrame(X_train_encoded, columns=X_train_columns)

X_test_imputed = imputer_X.fit_transform(X_test)
X_test = pd.DataFrame(X_test_imputed, columns=["color", "doors", "odometer", "price"])

X_test_encoded = encoder_X.fit_transform(X_test)
X_test_columns = encoder_X.get_feature_names_out(X_test.columns)
X_test = pd.DataFrame(X_test_encoded, columns=X_test_columns)

In [167]:
model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
model_forest.score(X_test, y_test)

0.47643979057591623

In [168]:
model_svc = LinearSVC()
model_svc.fit(X_train, y_train)
model_svc.score(X_test, y_test)



0.38219895287958117

In [169]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
model_knn.score(X_test, y_test)

0.3089005235602094