# Audi Used Car Price Regression (TensorFlow)

In [None]:
import math

import matplotlib.pyplot as plt

import numpy as np

import pandas as pd

import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler, StandardScaler

import statsmodels.api as sm

import tensorflow as tf

In [None]:
# Seed the global NumPy and TensorFlow random generators with one shared
# value so that repeated runs of the notebook draw the same random numbers.
SEED = 42

tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the raw Audi listings.
data = pd.read_csv("../data/used-cars/audi.csv")

# Only the two camelCase headers need renaming to snake_case; every other
# column already has its final name, so identity entries (and the previously
# duplicated "model" key) are omitted from the mapping.
data = data.rename(
    columns={
        "fuelType": "fuel_type",
        "engineSize": "engine_size",
    },
)

# Order the columns alphabetically so that later displays are stable.
data = data.reindex(sorted(data.columns), axis=1)

In [None]:
# Banner, then ten randomly drawn rows as the cell's displayed value.
print(
    "--------------------------------------------------",
    "Sample",
    "--------------------------------------------------",
    sep="\n",
)

data.sample(n=10)

In [None]:
# Banner, then one "<column>: <dtype>" line per column of `data`.
print(
    "--------------------------------------------------",
    "Data Types",
    "--------------------------------------------------",
    sep="\n",
)

for label, dtype in data.dtypes.items():
    print(f"{label}: {dtype}")

In [None]:
# Banner, then one "<column>: <number of NaN entries>" line per column.
print(
    "--------------------------------------------------",
    "Missing Values",
    "--------------------------------------------------",
    sep="\n",
)

for label, n_missing in data.isna().sum().items():
    print(f"{label}: {n_missing}")

In [None]:
# Banner, then the summary statistics with one row per described column,
# rounded to three decimals, as the cell's displayed value.
print(
    "--------------------------------------------------",
    "Descriptive Statistics",
    "--------------------------------------------------",
    sep="\n",
)

data.describe().T.round(3)

In [None]:
print("--------------------------------------------------")
print("Histograms")
print("--------------------------------------------------")

# One histogram per column of `data`, laid out on a grid three panels wide
# with 6 x 6 inches per panel.
n_histograms = len(data.columns)
n_histogram_rows = math.ceil(n_histograms / 3)
fig, axes = plt.subplots(ncols=3, nrows=n_histogram_rows)
fig.set_size_inches(18, n_histogram_rows * 6)
axes = axes.flatten()

for ax, column in zip(axes, data.columns):
    sns.histplot(data[column], ax=ax)
    ax.set_title(column)
    # Clear the axis labels only AFTER plotting: seaborn fills in its own
    # labels on an axis whose labels are empty, so clearing them beforehand
    # had no visible effect. The title already names the column.
    ax.set(xlabel=None, ylabel=None)

# Hide panels left over when the column count is not a multiple of three.
for spare_ax in axes[n_histograms:]:
    spare_ax.set_visible(False)

In [None]:
# Split of the predictor columns into categorical and numeric groups; both
# lists are reused by the plotting and preprocessing cells.
cat_features = ["fuel_type", "model", "transmission"]
num_features = ["engine_size", "mileage", "mpg", "tax", "year"]

print(
    "--------------------------------------------------",
    "Feature Types",
    "--------------------------------------------------",
    f"Categorical Features: {cat_features}",
    f"Numeric Features: {num_features}",
    sep="\n",
)

In [None]:
print("--------------------------------------------------")
print("Q-Q Plots")
print("--------------------------------------------------")

# One Q-Q plot per numeric feature, laid out on a grid three panels wide with
# 6 x 6 inches per panel. The feature names are read straight from
# `num_features` instead of slicing `data` just to get its column labels.
n_qqplots = len(num_features)
n_qqplot_rows = math.ceil(n_qqplots / 3)
fig, axes = plt.subplots(ncols=3, nrows=n_qqplot_rows)
fig.set_size_inches(18, n_qqplot_rows * 6)
axes = axes.flatten()

for ax, column in zip(axes, num_features):
    # line="s" draws the standardized reference line.
    sm.qqplot(data[column], line="s", ax=ax)
    ax.set_title(column)
    # Clear the axis labels only AFTER plotting: qqplot writes its own labels
    # onto the axis, so clearing them beforehand had no visible effect.
    ax.set(xlabel=None, ylabel=None)

# Hide panels left over when the feature count is not a multiple of three
# (five features on a 2 x 3 grid leave one panel empty).
for spare_ax in axes[n_qqplots:]:
    spare_ax.set_visible(False)

In [None]:
# Banner, then a scatter-plot matrix of the numeric features drawn with
# small (size 8) markers.
print(
    "--------------------------------------------------",
    "Pairwise Relationships",
    "--------------------------------------------------",
    sep="\n",
)

numeric_frame = data[num_features]
sns.pairplot(numeric_frame, plot_kws={"s": 8})

In [None]:
# Separate the predictors from the regression target.
X = data.drop(columns="price")
y = data["price"]

# Pin the split explicitly: without random_state the partition depends on how
# much global NumPy RNG state earlier cells consumed, so re-running cells in a
# different order silently changed the train/test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 1: fill missing values (mode for categorical columns, median for
# numeric ones). add_indicator=True asks the imputers to append
# missing-value indicator columns.
imputer = ColumnTransformer(
    [
        (
            "imputer_cat",
            SimpleImputer(strategy="most_frequent", add_indicator=True),
            cat_features,
        ),
        (
            "imputer_num",
            SimpleImputer(strategy="median", add_indicator=True),
            num_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Step 2: standardize the numeric columns.
# NOTE(review): min-max scaling the result of an affine standardization gives
# the same values as min-max scaling the raw column, so this step does not
# affect the final features; kept to preserve the pipeline's structure.
standard_scaler = ColumnTransformer(
    [
        (
            "standard_scaler",
            StandardScaler(),
            num_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Step 3: rescale the numeric columns to the [0, 1] range.
minmax_scaler = ColumnTransformer(
    [
        (
            "minmax_scaler",
            MinMaxScaler(feature_range=(0, 1)),
            num_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Step 4: one-hot encode the categorical columns. Categories unseen during
# fitting are ignored instead of raising.
encoder = ColumnTransformer(
    [
        (
            "encoder_cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# pandas output keeps column names between steps, which the later steps rely
# on to select columns by name.
preprocessing_pipeline = Pipeline(
    [
        ("imputer", imputer),
        ("standard_scaler", standard_scaler),
        ("minmax_scaler", minmax_scaler),
        ("encoder", encoder),
    ],
).set_output(transform="pandas")

# Fit on the training split only, then apply the fitted transform to the test
# split so no test-set statistics leak into preprocessing.
X_train = preprocessing_pipeline.fit_transform(X_train)
X_test = preprocessing_pipeline.transform(X_test)

# Cast once to float32 before building the tensors, so features and targets
# share a single floating-point dtype.
X_train = tf.convert_to_tensor(X_train.to_numpy(dtype=np.float32))
y_train = tf.convert_to_tensor(y_train.to_numpy(dtype=np.float32))
X_test = tf.convert_to_tensor(X_test.to_numpy(dtype=np.float32))
y_test = tf.convert_to_tensor(y_test.to_numpy(dtype=np.float32))

print("--------------------------------------------------")
print("Dataset Shapes")
print("--------------------------------------------------")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

In [None]:
# Fully connected regression network: four hidden layers and a single linear
# output unit that predicts the price.
model = tf.keras.Sequential(
    [
        # `shape` must be a tuple; `(n)` without the trailing comma is just
        # the integer n, which newer Keras versions reject.
        tf.keras.layers.Input(shape=(X_train.shape[1],), name="input"),
        # Hidden layers need a non-linear activation: a stack of Dense layers
        # without one collapses into a single linear map, so the extra depth
        # added nothing over plain linear regression.
        tf.keras.layers.Dense(64, activation="relu", name="dense_1"),
        tf.keras.layers.Dense(128, activation="relu", name="dense_2"),
        tf.keras.layers.Dense(64, activation="relu", name="dense_3"),
        tf.keras.layers.Dense(32, activation="relu", name="dense_4"),
        # The output stays linear so the prediction is unbounded.
        tf.keras.layers.Dense(1, name="output"),
    ],
    name="model",
)

model.summary()

In [None]:
# Adam with default settings, trained on mean squared error and monitored
# with mean absolute error.
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.MeanSquaredError()
metrics = tf.keras.metrics.MeanAbsoluteError()

# Pass everything by keyword: the positional order of `compile` differs
# between Keras versions (the third positional slot is `loss_weights` in
# Keras 3), so a positional call can silently drop the metric. `metrics` is
# also expected to be a list.
model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

In [None]:
# Train for 32 epochs in shuffled mini-batches of 32 samples, holding out 20%
# of the training data for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=32,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Predict prices for the held-out test set.
y_pred = model.predict(X_test)

# Common lower/upper limits covering both the true and the predicted values,
# so both axes share one scale.
y_min = min(np.amin(y_pred), np.amin(y_test))
y_max = max(np.amax(y_pred), np.amax(y_test))

fig, ax = plt.subplots(figsize=(8, 8))

# True vs. predicted values in blue; the red diagonal marks where the two
# would be equal.
ax.scatter(y_test, y_pred, s=8, color="blue")
ax.axline((y_min, y_min), (y_max, y_max), color="red")

ax.set_xlabel("True Value")
ax.set_xbound(y_min, y_max)
ax.set_ylabel("Predicted Value")
ax.set_ybound(y_min, y_max)