In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Convert the categorical columns to integer codes in place.
# pd.factorize returns (codes, uniques); only the codes are kept.
for column in ('sex', 'smoker', 'region'):
  dataset[column] = pd.factorize(dataset[column])[0]

dataset.tail()


In [None]:
# 80/20 split: sample 80% of the rows for training (fixed seed so the split is
# reproducible) and keep every remaining row for testing.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)


## Inspect the data
Review the joint distribution of a few pairs of columns from the training set.

In [None]:
# Pairwise scatter plots of the label and two numeric features, with KDE curves on the diagonal.
sns.pairplot(train_dataset[['expenses', 'age', 'bmi']], diag_kind='kde')

Let's also check the overall statistics. Note how each feature covers a very different range:

In [None]:
# Summary statistics, transposed so each column of the training set becomes a row.
train_dataset.describe().transpose()

## Split features from labels
Separate the target value—the "label"—from the features. This label is the value that you will train the model to predict.

In [None]:
# Work on copies so that popping the label leaves train_dataset/test_dataset intact.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

# pop() removes the 'expenses' column from the features and returns it as the label series.
train_labels = train_features.pop('expenses')
test_labels = test_features.pop('expenses')

## Normalization
In the table of statistics it's easy to see how different the ranges of each feature are:

In [None]:
# Only the mean and standard deviation of each training column.
train_dataset.describe().transpose()[['mean', 'std']]

It is good practice to normalize features that use different scales and ranges.

One reason this is important is because the features are multiplied by the model weights. So, the scale of the outputs and the scale of the gradients are affected by the scale of the inputs.

Although a model might converge without feature normalization, normalization makes training much more stable.

# The Normalization layer
The tf.keras.layers.Normalization is a clean and simple way to add feature normalization into your model.

The first step is to create the layer:

In [None]:
# Normalization layer shared by the multi-input models below; its statistics are set by adapt().
normalizer = tf.keras.layers.Normalization(axis=-1)

Then, fit the state of the preprocessing layer to the data by calling `Normalization.adapt`:

In [None]:
# Fit the layer's statistics on the training features only (the test set is not used here).
normalizer.adapt(np.array(train_features))

Calculate the mean and variance, and store them in the layer:

In [None]:
# Show the means stored in the layer after adapt().
print(normalizer.mean.numpy())

When the layer is called, it returns the input data, with each feature independently normalized:

In [None]:
# Take the first training row as a 2-D array and show it before/after normalization.
first = np.array(train_features.iloc[:1])
normalized_first = normalizer(first).numpy()

with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalized_first)

# Linear regression

Before building a deep neural network model, start with linear regression using one and several variables.

In [None]:
# Maps model name -> value returned by Model.evaluate on the test set; filled in as each model is trained.
test_results = {}

## Linear regression with one variable
Begin with a single-variable linear regression to predict 'expenses' from 'bmi'.

Training a model with `tf.keras` typically starts by defining the model architecture. Use a `tf.keras.Sequential` model, which represents a sequence of steps.

There are two steps in your single-variable linear regression model:
- Normalize the 'bmi' input features using the tf.keras.layers.Normalization preprocessing layer.
- Apply a linear transformation ($y = mx + b$) to produce 1 output using a linear layer (`tf.keras.layers.Dense`).

The number of inputs can either be set by the input_shape argument, or automatically when the model is run for the first time.

First, create a NumPy array made of the `bmi` features. Then, instantiate the `tf.keras.layers.Normalization` and fit its state to the `bmi` data:

In [None]:
# NOTE(review): this import is used by the model-definition cell that follows;
# consider moving it to the imports cell at the top of the notebook.
from tensorflow.keras.layers import Input
# Training-set 'bmi' column as a NumPy array.
bmi = np.array(train_features['bmi'])

# Normalization layer dedicated to the 'bmi' input; its statistics are
# fitted on the training column only.
bmi_normalizer = layers.Normalization(axis=None)
bmi_normalizer.adapt(bmi)

Build the Keras Sequential model:

In [None]:
# Single-variable linear regression: one scalar input -> normalize -> one linear unit.
bmi_model = tf.keras.Sequential()
bmi_model.add(layers.Input(shape=(1,)))
bmi_model.add(bmi_normalizer)
bmi_model.add(layers.Dense(units=1))

bmi_model.summary()

This model will predict 'expenses' from 'bmi'.

Run the untrained model on the first 10 'bmi' values. The output won't be good, but notice that it has the expected shape of (10, 1):

In [None]:
# Sanity check: run the untrained model on the first 10 'bmi' values.
bmi_model.predict(bmi[:10])

Once the model is built, configure the training procedure using the Keras `Model.compile` method. The most important arguments to compile are the `loss` and the `optimizer`, since these define what will be optimized (`mean_absolute_error`) and how (using the `tf.keras.optimizers.Adam`).

In [None]:
# Optimize mean absolute error with Adam at learning rate 0.1.
bmi_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)

Use Keras `Model.fit` to execute the training for 100 epochs:

In [None]:
%%time
history = bmi_model.fit(
    train_features['bmi'],
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2)

Visualize the model's training progress using the stats stored in the history object:

In [None]:
# Per-epoch training stats as a DataFrame, with the epoch number appended as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: object whose ``history`` attribute is a mapping containing
            the ``'loss'`` and ``'val_loss'`` sequences (e.g. the value
            returned by ``Model.fit`` when a validation split is used).
    """
    _, ax = plt.subplots(figsize=(10, 6))
    curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
    for key, label in curves:
        ax.plot(history.history[key], label=label)
    ax.set_title('Model Loss Over Epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (Mean Absolute Error)')
    ax.legend()
    ax.grid(True)
    plt.show()



In [None]:
# Loss curves for the most recent fit stored in `history`.
plot_loss(history)

In [None]:
# Record the single-variable linear model's test-set loss for the final comparison.
test_results['bmi_model'] = bmi_model.evaluate(
    x=test_features['bmi'],
    y=test_labels,
    verbose=0,
)

Since this is a single variable regression, it's easy to view the model's predictions as a function of the input:

In [None]:
# Sweep the BMI range actually present in the training data. The previous
# hard-coded 0..250 span extrapolated far outside the data, which squeezed
# the scatter plot into a corner.
bmi_min = float(train_features['bmi'].min())
bmi_max = float(train_features['bmi'].max())
x = tf.linspace(bmi_min, bmi_max, 251)
y = bmi_model.predict(x)

In [None]:
def plot_bmi(x, y):
  """Overlay model predictions on the training data.

  Draws a scatter of the notebook-level ``train_features['bmi']`` against
  ``train_labels`` on the current axes, then the curve ``(x, y)`` in black.

  Args:
    x: BMI values at which the model was evaluated.
    y: Predictions corresponding to ``x``.
  """
  ax = plt.gca()
  ax.scatter(train_features['bmi'], train_labels, label='Data')
  ax.plot(x, y, color='k', label='Predictions')
  ax.set_xlabel('BMI')
  ax.set_ylabel('Expense')
  ax.legend()


In [None]:
# Training data with the predictions computed in the previous cell.
plot_bmi(x, y)

## Linear regression with multiple inputs

You can use an almost identical setup to make predictions based on multiple inputs. This model still does the same $y = mx + b$ calculation except that $m$ is a matrix and $x$ is a vector.

Create a two-step Keras Sequential model again, with the first layer being the `normalizer` (`tf.keras.layers.Normalization(axis=-1)`) you defined earlier and adapted to the training features:

In [None]:
# Multi-input linear regression: normalize every feature, then one linear unit.
linear_model = tf.keras.Sequential()
linear_model.add(normalizer)
linear_model.add(layers.Dense(units=1))

When you call Model.predict on a batch of inputs, it produces units=1 outputs for each example:

In [None]:
# Run the untrained model on the first 10 training rows.
linear_model.predict(train_features[:10])

When you call the model, its weight matrices will be built—check that the kernel weights (the $m$ in $y = mx + b$) have a shape of `(6, 1)`, i.e. one weight per input feature:

In [None]:
# layers[1] is the Dense layer (layers[0] is the normalizer); display its kernel weights.
linear_model.layers[1].kernel

Configure the model with Keras `Model.compile` and train with `Model.fit` for 100 epochs:

In [None]:
# Same training configuration as the single-variable model: MAE loss, Adam at 0.1.
linear_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)

In [None]:
%%time
history = linear_model.fit(
    train_features,
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2)

Using all the inputs in this regression model achieves a much lower training and validation error than the `bmi_model`, which had one input:

In [None]:
# Per-epoch training stats as a DataFrame, with the epoch number appended as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
# NOTE(review): plot_loss is already defined earlier in this notebook with the
# same behavior; this re-definition is redundant and can be removed.
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: object whose ``history`` attribute is a mapping containing
            the ``'loss'`` and ``'val_loss'`` sequences (e.g. the value
            returned by ``Model.fit`` when a validation split is used).
    """
    _, ax = plt.subplots(figsize=(10, 6))
    curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
    for key, label in curves:
        ax.plot(history.history[key], label=label)
    ax.set_title('Model Loss Over Epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (Mean Absolute Error)')
    ax.legend()
    ax.grid(True)
    plt.show()



In [None]:
# Loss curves for the most recent fit stored in `history`.
plot_loss(history)

Collect the results on the test set for later:

In [None]:
# Record the multi-input linear model's test-set loss for the final comparison.
test_results['linear_model'] = linear_model.evaluate(
    x=test_features,
    y=test_labels,
    verbose=0,
)

## Regression with a deep neural network (DNN)
In the previous section, you implemented two linear models for single and multiple inputs.

Here, you will implement single-input and multiple-input DNN models.

The code is basically the same except the model is expanded to include some "hidden" non-linear layers. The name "hidden" here just means not directly connected to the inputs or outputs.

These models will contain a few more layers than the linear model:
- The normalization layer, as before (with `bmi_normalizer` for a single-input model and `normalizer` for a multiple-input model).
- Two hidden, non-linear, Dense layers with the ReLU (`relu`) activation function nonlinearity.
- A linear `Dense` single-output layer.

Both models will use the same training procedure, so the `compile` method is included in the `build_and_compile_model` function below.

In [None]:
def build_and_compile_model(norm, metrics=None):
  """Build and compile a small fully connected regression network.

  The network is: ``norm`` -> Dense(64, relu) -> Dense(64, relu) -> Dense(1).
  It is compiled with mean-absolute-error loss and Adam at learning rate 0.001.

  Args:
    norm: Layer placed first in the model (the notebook passes an adapted
      Normalization layer).
    metrics: Optional list of metrics forwarded to ``Model.compile``, e.g.
      ``['mae', 'mse']``. Defaults to ``None`` (loss only), which matches the
      previous behavior; note that passing metrics makes ``Model.evaluate``
      return a list instead of a single scalar.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  model = keras.Sequential([
      norm,
      layers.Dense(64, activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(1),
  ])

  model.compile(
      loss='mean_absolute_error',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
      metrics=metrics,
  )
  return model

## Regression using a DNN and a single input
Create a DNN model with only 'BMI' as input and `bmi_normalizer` (defined earlier) as the normalization layer:

In [None]:
# Single-input DNN: reuses the 'bmi' normalizer adapted earlier.
dnn_bmi_model = build_and_compile_model(bmi_normalizer)

This model has quite a few more trainable parameters than the linear models:

In [None]:
# Print the layer-by-layer summary of the single-input DNN.
dnn_bmi_model.summary()

Train the model with Keras `Model.fit`:

In [None]:
%%time
history = dnn_bmi_model.fit(
    train_features['bmi'],
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

This model does slightly better than the linear single-input bmi_model:

In [None]:
# Loss curves for the most recent fit stored in `history`.
plot_loss(history)

If you plot the predictions as a function of 'BMI', you should notice how this model takes advantage of the nonlinearity provided by the hidden layers:

In [None]:
# Sweep the BMI range actually present in the training data. The previous
# hard-coded 0..250 span extrapolated far outside the data, which squeezed
# the scatter plot into a corner.
bmi_min = float(train_features['bmi'].min())
bmi_max = float(train_features['bmi'].max())
x = tf.linspace(bmi_min, bmi_max, 251)
y = dnn_bmi_model.predict(x)

In [None]:
# Training data with the predictions computed in the previous cell.
plot_bmi(x, y)

Collect the results on the test set for later:

In [None]:
# Record the single-input DNN's test-set loss for the final comparison.
test_results['dnn_bmi_model'] = dnn_bmi_model.evaluate(
    x=test_features['bmi'],
    y=test_labels,
    verbose=0,
)

## Regression using a DNN and multiple inputs
Repeat the previous process using all the inputs. The model's performance slightly improves on the validation dataset.

In [None]:
# Multi-input DNN: reuses the normalizer adapted on all training features.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

In [None]:
# Loss curves for the most recent fit stored in `history`.
plot_loss(history)

Collect the results on the test set:

In [None]:
# Record the multi-input DNN's test-set loss for the final comparison.
test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)

## Performance
Since all models have been trained, you can review their test set performance:

In [None]:
# One row per model, one column holding its test-set mean absolute error.
results_table = pd.DataFrame(test_results, index=['Mean absolute error [expense]'])
results_table.transpose()

These results match the validation error observed during training.

## Make predictions
You can now make predictions with the dnn_model on the test set using Keras Model.predict and review the loss:

In [None]:
test_predictions = dnn_model.predict(test_features).flatten()

# Derive the upper axis limit from the data. The previous hard-coded
# [0, 10000] window cut off every point above 10,000, although the grading
# cell at the end of this notebook plots the same quantities on a 0..50,000 axis.
upper = float(max(test_labels.max(), test_predictions.max()))
lims = [0, upper]

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [expense]')
plt.ylabel('Predictions [expense]')
plt.xlim(lims)
plt.ylim(lims)
# Diagonal reference line: points on it are perfect predictions.
_ = plt.plot(lims, lims)

Now, check the error distribution:

In [None]:
# Signed prediction error per test example (positive = over-prediction).
error = test_predictions - test_labels

_, ax = plt.subplots()
ax.hist(error, bins=25)
ax.set_xlabel('Prediction Error [expense]')
_ = ax.set_ylabel('Count')

In [None]:
# Prepare the names the grading cell expects.
model = dnn_model

# The grading cell unpacks `loss, mae, mse = model.evaluate(...)`. A model
# compiled without metrics makes evaluate() return a single scalar, and that
# unpacking fails. Re-compiling with both metrics changes only the
# training/evaluation configuration; the trained weights are kept.
model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['mae', 'mse'])

# The grading cell passes `test_dataset` as model input, so it must hold the
# feature columns only (the 'expenses' label was popped into test_labels).
test_dataset = test_features

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# Expects three notebook-level names: `model`, `test_dataset` (passed as the
# model input) and `test_labels`. evaluate() is unpacked into three values
# (loss, mae, mse), so it must return exactly three results.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass threshold: mean absolute error strictly below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
test_predictions = model.predict(test_dataset).flatten()

# Equal-aspect scatter of true vs. predicted values with a diagonal reference line.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims,lims)
