# Car Sales Price Regression


In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the raw car-sales table, map its headers to short lowercase names,
# and drop every row whose price is missing.
df = (
    pd.read_csv("./data/car-sales.csv")
    .rename(
        columns={
            "Make": "make",
            "Colour": "color",
            "Odometer (KM)": "odometer",
            "Doors": "doors",
            "Price": "price",
        }
    )
    .dropna(subset=["price"])
)

## Option 1: train_test_split()


In [None]:
# Target is the price column; every other column is a predictor.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Fill values for the numeric columns, computed once from the training split.
doors_fill = X_train["doors"].mode()[0]
odometer_fill = X_train["odometer"].mean()

# Missing-value handling, one constant-fill imputer per column group:
#   make / color -> the literal "Unknown"
#   doors        -> most common door count in the training split
#   odometer     -> mean odometer reading in the training split
imputer = ColumnTransformer(
    transformers=[
        (
            "imputer_na",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
            ["make", "color"],
        ),
        (
            "imputer_mode",
            SimpleImputer(strategy="constant", fill_value=doors_fill),
            ["doors"],
        ),
        (
            "imputer_mean",
            SimpleImputer(strategy="constant", fill_value=odometer_fill),
            ["odometer"],
        ),
    ],
    remainder="passthrough",
)

# One-hot encode the three categorical columns as a dense array; any other
# column is passed through unchanged.
encoder = ColumnTransformer(
    transformers=[
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["make", "color", "doors"],
        ),
    ],
    remainder="passthrough",
)

In [None]:
# Column labels for the imputer output: the imputer's transformers are listed
# in the order make, color, doors, odometer.
imputed_columns = ["make", "color", "doors", "odometer"]

# Fit the preprocessing steps on the training split only.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=imputed_columns,
)

X_train = pd.DataFrame(
    encoder.fit_transform(X_train),
    columns=encoder.get_feature_names_out(X_train.columns),
)

# Apply the already-fitted transformers to the test split. Calling
# fit_transform here would refit the encoder on the test rows, so the one-hot
# columns could differ in number or order from the ones the models are trained
# on, and it would leak test-set information into preprocessing.
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=imputed_columns,
)

X_test = pd.DataFrame(
    encoder.transform(X_test),
    columns=encoder.get_feature_names_out(X_test.columns),
)

In [None]:
# Train each regressor on the preprocessed training split, then record the
# value returned by its score() method on the held-out split.
model_forest = RandomForestRegressor().fit(X_train, y_train)
model_forest_score = model_forest.score(X_test, y_test)

model_lasso = Lasso().fit(X_train, y_train)
model_lasso_score = model_lasso.score(X_test, y_test)

model_elastic = ElasticNet().fit(X_train, y_train)
model_elastic_score = model_elastic.score(X_test, y_test)

# Report the three hold-out scores.
print(f"RandomForest: {model_forest_score}")
print(f"Lasso: {model_lasso_score}")
print(f"ElasticNet: {model_elastic_score}")

## Option 2: cross_val_score()


In [None]:
# Rebuild the raw predictors and target from the full table; no manual split
# is made in this option.
y = df["price"]
X = df.drop(columns=["price"])

In [None]:
# Missing-value handling whose statistics are learned when the transformer is
# fitted:
#   make / color -> the literal "Unknown"
#   doors        -> most frequent value
#   odometer     -> mean value
imputer = ColumnTransformer(
    transformers=[
        (
            "imputer_na",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
            ["make", "color"],
        ),
        ("imputer_mode", SimpleImputer(strategy="most_frequent"), ["doors"]),
        ("imputer_mean", SimpleImputer(strategy="mean"), ["odometer"]),
    ],
    remainder="passthrough",
)

# One-hot encode the first three columns by position (0, 1, 2) and pass any
# remaining column through unchanged.
encoder = ColumnTransformer(
    transformers=[
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            [0, 1, 2],
        ),
    ],
    remainder="passthrough",
)

In [None]:
def _build_pipeline(estimator):
    """Return an impute -> one-hot encode -> estimate pipeline around *estimator*."""
    return Pipeline(
        [
            ("imputer", imputer),
            ("encoder", encoder),
            ("estimator", estimator),
        ]
    )


# Score each model with 5-fold cross-validation on the raw features; the
# preprocessing steps are part of the pipeline passed to cross_val_score.
model_lasso = _build_pipeline(Lasso())
model_lasso_scores = cross_val_score(model_lasso, X, y, cv=5)

model_forest = _build_pipeline(RandomForestRegressor())
model_forest_scores = cross_val_score(model_forest, X, y, cv=5)

model_elastic = _build_pipeline(ElasticNet())
model_elastic_scores = cross_val_score(model_elastic, X, y, cv=5)

# Report the mean of the per-fold scores. These must read the `*_scores`
# arrays computed above, not the single `*_score` values from Option 1.
print(f"RandomForest: {np.mean(model_forest_scores)}")
print(f"Lasso: {np.mean(model_lasso_scores)}")
print(f"ElasticNet: {np.mean(model_elastic_scores)}")