In [13]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

In [14]:
df = pd.read_csv("./data/car-sales.csv")
df.rename(
    columns={
        "Make": "make",
        "Colour": "color",
        "Odometer (KM)": "odometer",
        "Doors": "doors",
        "Price": "price",
    },
    inplace=True,
)
df.dropna(subset=["make"], inplace=True)

In [15]:
X = df.drop("make", axis=1)
y = df["make"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
imputer = ColumnTransformer(
    [
        (
            "imputer_color",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
            ["color"],
        ),
        (
            "imputer_doors",
            SimpleImputer(strategy="constant", fill_value=X_train["doors"].mode()[0]),
            ["doors"],
        ),
        (
            "imputer_odometer",
            SimpleImputer(strategy="constant", fill_value=X_train["odometer"].mean()),
            ["odometer"],
        ),
        (
            "imputer_price",
            SimpleImputer(strategy="constant", fill_value=X_train["price"].mean()),
            ["price"],
        ),
    ],
    remainder="passthrough",
)

encoder = ColumnTransformer(
    [
        (
            "encoder_X",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["color", "doors"],
        ),
    ],
    remainder="passthrough",
)

In [17]:
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=["color", "doors", "odometer", "price"],
)

X_train = pd.DataFrame(
    encoder.fit_transform(X_train),
    columns=encoder.get_feature_names_out(X_train.columns),
)

X_test = pd.DataFrame(
    imputer.fit_transform(X_test),
    columns=["color", "doors", "odometer", "price"],
)

X_test = pd.DataFrame(
    encoder.fit_transform(X_test),
    columns=encoder.get_feature_names_out(X_test.columns),
)

In [18]:
model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
model_forest_score = model_forest.score(X_test, y_test)

model_svc = LinearSVC()
model_svc.fit(X_train, y_train)
model_svc_score = model_svc.score(X_test, y_test)

model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
model_knn_score = model_knn.score(X_test, y_test)

print(f"RandomForest: {model_forest_score}")
print(f"LinearSVC: {model_svc_score}")
print(f"KNearestNeighbors: {model_knn_score}")

RandomForest: 0.45549738219895286
LinearSVC: 0.3717277486910995
KNearestNeighbors: 0.3717277486910995


