# TensorFlow: Predict Fuel Efficiency Using Linear and DNN Regression

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Lower the verbosity of TensorFlow's native logging. The variable is assigned
# before the import below so it is already in the environment when TensorFlow
# loads. NOTE(review): level "1" is expected to hide INFO messages only —
# confirm against the TensorFlow version in use.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf

# Report the runtime configuration: library version, eager-execution status,
# and whether at least one physical GPU device is visible.
print("TF Version: ", tf.__version__)
print("TF Eager mode: ", tf.executing_eagerly())
print("TF GPU is", "available" if tf.config.list_physical_devices("GPU") else "not available")

# Prepare Dataset

## Load dataset

In [None]:
# Auto MPG dataset from the UCI Machine Learning Repository. Fetched over
# HTTPS (instead of plain HTTP) so the training data is transported over an
# authenticated, encrypted channel.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column names are supplied explicitly to read_csv below, so the first line of
# the file is parsed as data rather than as a header.
column_names = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

# Download and parse the space-separated file:
#   na_values="?"         -> "?" entries are read as NaN
#   comment="\t"          -> the rest of a line after a tab is ignored
#                            (presumably the trailing free-text field — verify
#                            against the raw file)
#   sep=" " together with skipinitialspace=True
#                         -> spaces following a delimiter are skipped, so
#                            padded columns parse as single fields
raw_dataset = pd.read_csv(
    url,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True)

In [None]:
# Work on a copy so the freshly downloaded raw_dataset stays untouched.
dataset = raw_dataset.copy()
# Preview the last rows.
dataset.tail()

## Clean dataset

In [None]:
# Drop rows with missing values (the "?" entries were read as NaN on load).
dataset = dataset.dropna()

In [None]:
# Replace the numeric Origin codes with readable region names
# (1 -> "USA", 2 -> "Europe", 3 -> "Japan").
dataset["Origin"] = dataset["Origin"].map({1: "USA", 2: "Europe", 3: "Japan"})
dataset.tail()

In [None]:
# One-hot encode Origin: the column is replaced by one indicator column per
# region (Europe, Japan, USA). Because dtype="float" is requested, the
# indicators hold 1.0/0.0 rather than booleans. The empty prefix and separator
# make the new column names exactly the region names.
dataset = pd.get_dummies(
    dataset,
    columns=["Origin"],
    prefix="",
    prefix_sep="",
    dtype="float")
dataset.tail()

## Split dataset

In [None]:
# Randomly sample 80% of the rows for training (fixed random_state so the
# split is repeatable); the rows whose index was not sampled form the test set.
train_ds = dataset.sample(frac=0.8, random_state=0)
test_ds = dataset.drop(train_ds.index)

In [None]:
# Summary statistics of the training set, transposed so that each dataset
# column becomes one row of the table.
train_ds.describe().T

In [None]:
# Training split: copy the frame, then pop the "MPG" target out of the copy so
# that only the input features remain in train_features.
train_features = train_ds.copy()
train_labels = train_features.pop("MPG")

# Test split: same separation of inputs and target.
test_features = test_ds.copy()
test_labels = test_features.pop("MPG")

In [None]:
# Preview the last rows of the feature matrix (the "MPG" column was popped out).
train_features.tail()

## Normalizers

In [None]:
# Create and adapt normalizer for all columns.
# axis=-1 normalizes along the last (feature) axis; adapt() fits the layer's
# statistics on the training features only, so the test set does not influence
# the normalization.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

In [None]:
# Create and adapt normalizer for the "Horsepower" column alone.
# axis=None: no per-feature axis is kept, so the layer normalizes with one
# scalar set of statistics fitted on the training horsepower values.
horsepower_normalizer = tf.keras.layers.Normalization(axis=None)
horsepower_normalizer.adapt(np.array(train_features["Horsepower"]))

# Linear regression

In [None]:
def plot_loss(history, ylim=(0, 10)):
  """Plot the training and validation loss curves of a training run.

  Args:
    history: Object returned by `Model.fit`; its `history` dict must contain
      the "loss" and "val_loss" series (i.e. fit was run with validation
      data).
    ylim: `(bottom, top)` limits for the y-axis. Defaults to `(0, 10)`, the
      range previously hard-coded here. Pass `None` to keep matplotlib's
      automatic limits.
  """
  plt.plot(history.history["loss"], label="Loss")
  plt.plot(history.history["val_loss"], label="Validation Loss")
  if ylim is not None:
    plt.ylim(ylim)
  plt.xlabel("Epoch")
  plt.ylabel("Error [MPG]")
  plt.legend()
  plt.grid(True)

## Linear regression with one variable

In [None]:
# Select horsepower feature (the single input of the one-variable models).
# NOTE(review): the later visible cells index train_features["Horsepower"]
# directly instead of using this variable.
horsepower = train_features["Horsepower"]

In [None]:
# One-variable linear model: normalize horsepower, then apply a single Dense
# unit with no activation specified.
horsepower_model = tf.keras.Sequential([
    horsepower_normalizer,
    tf.keras.layers.Dense(1),
])

In [None]:
# Train with Adam (learning rate 0.1), minimizing the mean absolute error.
horsepower_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss="mean_absolute_error")

In [None]:
%%time
# Fit on the horsepower column for 100 epochs without console output; 20% of
# the training data is held out for validation so "val_loss" gets recorded.
history = horsepower_model.fit(
    x=train_features["Horsepower"],
    y=train_labels,
    validation_split=0.2,
    epochs=100,
    verbose=0,
)

In [None]:
# Loss curves of the one-variable linear model.
plot_loss(history)

In [None]:
# Evaluate on the held-out test set. The model was compiled with a loss only
# (no extra metrics), so the result is the test mean absolute error.
result = horsepower_model.evaluate(
    test_features["Horsepower"],
    test_labels,
    verbose=0)

# Collect the test score of each model under its name; later cells add their
# own entries for the final comparison table.
test_results = {
    "horsepower_model": result
}

In [None]:
# Predict over an evenly spaced horsepower grid: 251 points from 0 to 250.
x = tf.linspace(0.0, 250, 251)
y = horsepower_model.predict(x)

In [None]:
def plot_horsepower(x, y):
  """Overlay a model's predictions on the training data.

  Draws the training samples (horsepower vs. MPG, read from the module-level
  `train_features` and `train_labels`) as a scatter plot and the curve `y`
  over `x` as a black line, then labels the axes and adds a legend.

  Args:
    x: Horsepower values at which the model was evaluated.
    y: Predicted MPG for each entry of `x`.
  """
  hp_values = train_features["Horsepower"]
  plt.scatter(hp_values, train_labels, label="Data")
  plt.plot(x, y, label="Predictions", color="k")
  plt.ylabel("MPG")
  plt.xlabel("Horsepower")
  plt.legend()

In [None]:
# Predictions of the most recently evaluated model (x, y from the previous
# cell) drawn over the training data.
plot_horsepower(x, y)

## Linear regression with multiple inputs

In [None]:
# Multi-input linear model: normalize every feature column with the adapted
# normalizer, then combine them in a single Dense unit.
linear_model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(units=1)
])

In [None]:
# Same training setup as the one-variable model: mean absolute error loss,
# Adam optimizer with learning rate 0.1.
linear_model.compile(
    loss="mean_absolute_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)

In [None]:
%%time
# Fit on all feature columns for 100 epochs without console output; 20% of the
# training data is held out for validation.
history = linear_model.fit(
    x=train_features,
    y=train_labels,
    validation_split=0.2,
    epochs=100,
    verbose=0,
)

In [None]:
# Loss curves of the multi-input linear model.
plot_loss(history)

In [None]:
# Test-set score of the multi-input linear model.
result = linear_model.evaluate(
    test_features,
    test_labels,
    verbose=0)

# Record it next to the other models' results.
test_results["linear_model"] = result

## Regression with DNN and a single input

In [None]:
# Single-input DNN: reuses the already adapted horsepower_normalizer, followed
# by two hidden layers of 64 ReLU units and a one-unit output layer.
dnn_horsepower_model = tf.keras.Sequential([
    horsepower_normalizer,
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1)
])

In [None]:
# Mean absolute error loss; Adam with a smaller learning rate (0.001) than the
# linear models use.
dnn_horsepower_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mean_absolute_error",
)

In [None]:
%%time
# Fit on the horsepower column for 100 epochs without console output; 20% of
# the training data is held out for validation.
history = dnn_horsepower_model.fit(
    x=train_features["Horsepower"],
    y=train_labels,
    epochs=100,
    verbose=0,
    validation_split=0.2,
)

In [None]:
# Loss curves of the single-input DNN.
plot_loss(history)

In [None]:
# Predict over the same horsepower grid as before: 251 points from 0 to 250.
x = tf.linspace(0.0, 250, 251)
y = dnn_horsepower_model.predict(x)

In [None]:
# Predictions of the single-input DNN (x, y from the previous cell) drawn over
# the training data.
plot_horsepower(x, y)

In [None]:
# Test-set score of the single-input DNN.
result = dnn_horsepower_model.evaluate(
    test_features["Horsepower"],
    test_labels,
    verbose=0)

# Record it next to the other models' results.
test_results["dnn_horsepower_model"] = result

## Regression using a DNN and multiple inputs

In [None]:
# Multi-input DNN: reuses the already adapted all-column normalizer, followed
# by two hidden layers of 64 ReLU units and a one-unit output layer.
dnn_model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1)
])

In [None]:
# Mean absolute error loss; Adam with learning rate 0.001, the same setup as
# the single-input DNN.
dnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mean_absolute_error",
)

In [None]:
%%time
# Fit on all feature columns for 100 epochs without console output; 20% of the
# training data is held out for validation.
history = dnn_model.fit(
    x=train_features,
    y=train_labels,
    epochs=100,
    verbose=0,
    validation_split=0.2,
)

In [None]:
# Loss curves of the multi-input DNN.
plot_loss(history)

In [None]:
# Test-set score of the multi-input DNN.
result = dnn_model.evaluate(test_features, test_labels, verbose=0)

# Record it next to the other models' results.
test_results["dnn_model"] = result

# Performance

In [None]:
# Comparison table: after the transpose there is one row per model and a single
# column holding its test score.
pd.DataFrame(test_results, index=["Mean absolute error [MPG]"]).T

In [None]:
# Predict MPG for the test set with the multi-input DNN and flatten the model
# output into a 1-D array.
test_predictions = dnn_model.predict(test_features).flatten()

# Scatter predictions against the true values on equal-aspect axes limited to
# 0-50 on both sides. The line drawn last is the diagonal (prediction == true
# value), so the distance of a point from it shows the prediction error.
a = plt.axes(aspect="equal")
plt.scatter(test_labels, test_predictions)
plt.xlabel("True Values [MPG]")
plt.ylabel("Predictions [MPG]")
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
# Assigning to "_" keeps the returned value from being echoed as cell output.
_ = plt.plot(lims, lims)

In [None]:
# Signed prediction error (prediction minus true value), shown as a histogram
# with 25 bins.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel("Prediction Error [MPG]")
# Assigning to "_" keeps the returned value from being echoed as cell output.
_ = plt.ylabel("Count")