# Keras

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from keras import Model
from keras import Input
from keras.layers import Dense
from keras.layers import Normalization
from keras.layers import Rescaling

from keras.optimizers import RMSprop

from keras.datasets import mnist

from keras.models import load_model

In [None]:
# If you are running on Google Colab, uncomment the next line before executing this code cell.

# ! pip install keras_tuner

import keras_tuner

In [None]:
rng = np.random.RandomState(2)

## Read in datasets and split them

In [None]:
import os

# Work out where the notebook's files live. By default that is the current
# directory; on Google Colab, Google Drive is mounted and used instead.
base_dir = "."
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = "./drive/My Drive/Colab Notebooks/"

# All CSV datasets are read from this folder
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
df_housing = pd.read_csv(os.path.join(dataset_dir, "housing.csv"))

In [None]:
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

X_housing = df_housing[features].values
y_housing = df_housing["SalePrice"].values

In [None]:
# Hold out 20% of the housing examples as the test set.
# `rng` is the shared RandomState created earlier in the notebook, so the
# exact split depends on how many times `rng` has already been used.
X_train_housing, X_test_housing, y_train_housing, y_test_housing = \
    train_test_split(X_housing, y_housing, test_size=0.2, random_state=rng)

In [None]:
df_cs1109 = pd.read_csv(os.path.join(dataset_dir, "cs1109.csv"))

In [None]:
# The two input features of the CS1109 student data
features = ["lect", "lab"]

# Use a NumPy array (via .values), matching how the housing and iris
# features are prepared elsewhere in this notebook
X_cs1109 = df_cs1109[features].values

# Encode the class labels in the "outcome" column as integers
label_encoder = LabelEncoder()
y_cs1109 = label_encoder.fit_transform(df_cs1109["outcome"])

In [None]:
# Hold out 20% of the student examples as the test set, stratifying on the
# "outcome" column so both parts keep the same class proportions.
X_train_cs1109, X_test_cs1109, y_train_cs1109, y_test_cs1109 = \
    train_test_split(X_cs1109, y_cs1109, test_size=0.2, stratify=df_cs1109["outcome"], random_state=rng)

In [None]:
# Load the dataset (a dictionary) and get the features DataFrame and target values from the dictionary
iris = load_iris(as_frame=True)
iris_df = iris.data
iris_y = iris.target

# Split off the test set: 20% of the dataset.
train_iris_df, test_iris_df, train_iris_y, test_iris_y = \
    train_test_split(iris_df, iris_y, train_size=0.8, stratify=iris_y, random_state=rng)

# Convert to numpy arrays
X_train_iris = train_iris_df.values
y_train_iris = train_iris_y.values
X_test_iris = test_iris_df.values
y_test_iris = test_iris_y.values

## Regression on Housing Data

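A Normalization layer standardizes (scales) the data: it subtracts each feature's mean and divides by its standard deviation. These statistics must either be passed to the layer's constructor or learned by calling its `adapt` method on the training data.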

In [None]:
# A Normalization layer needs each feature's mean and variance before it can
# standardize; learn them from the training features only, so that nothing
# about the test set leaks into the model.
normalizer = Normalization()
normalizer.adapt(np.asarray(X_train_housing, dtype="float32"))

# 4 input features -> two hidden ReLU layers -> one linear output (the predicted price)
inputs = Input(shape=(4,))
x = normalizer(inputs)
x = Dense(units=16, activation="relu")(x)
x = Dense(units=8, activation="relu")(x)
outputs = Dense(units=1, activation="linear")(x)
housing_model = Model(inputs, outputs)

In [None]:
housing_model.compile(optimizer=RMSprop(learning_rate=0.001), loss="mse", metrics=["mae"])

In [None]:
housing_model.fit(X_train_housing, y_train_housing, epochs=40, batch_size=32, verbose=0)

In [None]:
test_loss, test_mae = housing_model.evaluate(X_test_housing, y_test_housing)
test_mae

We should edit the code and experiment: e.g. add or remove hidden layers, change the number of neurons in the hidden layers, change ReLU to sigmoid, change from RMSprop to another optimizer, change the learning rate, change the number of epochs, or change the batch size.

## Binary Classification on Student Data

In [None]:
# A Normalization layer needs each feature's mean and variance before it can
# standardize; learn them from the training features only. np.asarray makes
# this work whether the features are held in a DataFrame or in an array.
normalizer = Normalization()
normalizer.adapt(np.asarray(X_train_cs1109, dtype="float32"))

# 2 input features -> two hidden ReLU layers -> one sigmoid output (binary classification)
inputs = Input(shape=(2,))
x = normalizer(inputs)
x = Dense(units=16, activation="relu")(x)
x = Dense(units=8, activation="relu")(x)
outputs = Dense(units=1, activation="sigmoid")(x)
cs1109_model = Model(inputs, outputs)

In [None]:
cs1109_model.compile(optimizer=RMSprop(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
cs1109_model.fit(X_train_cs1109, y_train_cs1109, epochs=40, batch_size=32, verbose=0)

In [None]:
test_loss, test_acc = cs1109_model.evaluate(X_test_cs1109, y_test_cs1109)
test_acc

## Multiclass Classification on Irises

In [None]:
# A Normalization layer needs each feature's mean and variance before it can
# standardize; learn them from the training features only.
normalizer = Normalization()
normalizer.adapt(np.asarray(X_train_iris, dtype="float32"))

# 4 input features -> two hidden ReLU layers -> 3 softmax outputs (one per class)
inputs = Input(shape=(4,))
x = normalizer(inputs)
x = Dense(units=16, activation="relu")(x)
x = Dense(units=8, activation="relu")(x)
outputs = Dense(units=3, activation="softmax")(x)
iris_model = Model(inputs, outputs)

In [None]:
iris_model.compile(optimizer=RMSprop(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
iris_model.fit(X_train_iris, y_train_iris, epochs=40, batch_size=32, verbose=0)

In [None]:
test_loss, test_acc = iris_model.evaluate(X_test_iris, y_test_iris)
test_acc

Note the loss function above: sparse_categorical_crossentropy for multiclass classification when the classes are integers, e.g. 0 = one kind of Iris, 1 = another kind, 2 = a third kind (which is what we have in the Iris dataset).

Don't use categorical_crossentropy. This is for when the *classes* have been one-hot encoded. This is not something we've been doing. Yes, we one-hot encoded the nominal-valued features - but not the classes.

## Model Selection

Oops! We went straight from training (`fit`) to error estimation on the test set (`evaluate`). How do we do validation sets in Keras?

The answer is that you can ask `fit` to split off some validation data. Let's illustrate on the housing data.

In [None]:
housing_history = housing_model.fit(X_train_housing, y_train_housing, validation_split=0.25, epochs=40, batch_size=32, verbose=0)

So, in this example, it will train on 60% of the full dataset and, at the end of every epoch, it will test on 20%. (We held out the remaining 20% as the test set.)

So this uses holdout to get the validation set. There is no option for using k-fold cross-validation. You could write your own. But the assumption is that you are using the kind of large datasets where holdout is appropriate. (Of course, that isn't really true for the housing, CS1109 and Iris datasets.)

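`housing_history` will be a `History` object whose `history` attribute is a dictionary that contains the loss at the end of every epoch and, in our case, the `mae` at the end of every epoch - or whatever `metrics` you asked for in `compile`. Because we asked for validation data, the dictionary also contains the corresponding validation values (`val_loss` and `val_mae`).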

In [None]:
def plot_keras_history(history, metric):
    """Plot the per-epoch loss and one metric from a Keras training run.

    Draws two side-by-side line plots: the loss on the left and `metric` on
    the right. Validation curves are added when `history` contains them,
    i.e. when validation data was used during `fit`.

    Parameters
    ----------
    history : object returned by `fit`
        Its `history` attribute is read as a dictionary that maps names such
        as "loss" and "val_loss" to lists of per-epoch values.
    metric : str
        Name of a metric that was recorded during training, e.g. "mae".

    Raises
    ------
    KeyError
        If "loss" or `metric` was not recorded in `history`.
    """
    logs = history.history
    fig, axes = plt.subplots(1, 2, figsize=(6, 3))
    for ax, key, title in zip(axes, ("loss", metric), ("Loss", metric)):
        ax.plot(logs[key], label="train " + key)
        # Validation values exist only if fit was given validation data
        val_key = "val_" + key
        if val_key in logs:
            ax.plot(logs[val_key], label="val " + key)
        ax.set_title(title)
        ax.legend()
    # Lay out the figure only after titles and legends exist, so that they
    # are taken into account when the subplots are spaced
    fig.tight_layout()
    plt.show()

In [None]:
plot_keras_history(housing_history, "mae")

The plots help us detect problems.

Question. Look at the left-hand plot. Why is this bad news?

Once problems have been ironed out, you will be most interested in the final values:

In [None]:
train_err, val_err = housing_history.history["mae"][-1], housing_history.history["val_mae"][-1]
train_err, val_err

But what about using the validation set to help us to choose the values of hyperparameters?

A separate module (`keras_tuner`) makes this easier.

Here's a simple example. First, we must build the model and compile it within a function - but we specify which parts are hyperparameters and which values we would like to try:

In [None]:
def build_housing_model(hp):
    """Build and compile a housing regression model for KerasTuner.

    The only hyperparameter is "units", the number of neurons in the second
    hidden layer, chosen from 2, 4 and 8.

    Parameters
    ----------
    hp : the hyperparameters object that KerasTuner passes in; it is used
        here to declare and sample the "units" hyperparameter.

    Returns
    -------
    The compiled Keras model (RMSprop, MSE loss, MAE metric).
    """
    # A Normalization layer needs each feature's mean and variance before it
    # can standardize, so learn them from the training features.
    # NOTE: this uses all of X_train_housing, including any rows that a
    # later `validation_split` holds out for validation.
    normalizer = Normalization()
    normalizer.adapt(np.asarray(X_train_housing, dtype="float32"))

    inputs = Input(shape=(4,))
    x = normalizer(inputs)
    x = Dense(units=16, activation="relu")(x)
    # The width of this layer is the hyperparameter being tuned
    x = Dense(hp.Choice("units", [2, 4, 8]), activation="relu")(x)
    outputs = Dense(units=1, activation="linear")(x)
    housing_model = Model(inputs, outputs)
    housing_model.compile(optimizer=RMSprop(learning_rate=0.001), loss="mse", metrics=["mae"])
    return housing_model

In [None]:
# Grid search over the hyperparameter values declared in build_housing_model,
# comparing trials on the validation MAE.
tuner = keras_tuner.GridSearch(
    build_housing_model,
    objective="val_mae",
    # The tuner's state is stored under directory/project_name
    directory = base_dir,
    project_name="tuner_state",
    # Start afresh rather than reloading an existing project of the same name
    overwrite=True)

In [None]:
tuner.search(X_train_housing, y_train_housing, epochs=20, validation_split=0.25)

In [None]:
tuner.get_best_hyperparameters()[0].values

In [None]:
best_housing_model = tuner.get_best_models(num_models=1)[0]

In [None]:
best_housing_model.summary()

In [None]:
history = best_housing_model.fit(X_train_housing, y_train_housing, validation_split=0.25, epochs=40, batch_size=32, verbose=0)

Here's a second example in which there are more hyperparameters than in the previous example, and therefore we use a randomized search instead of a grid search.

In [None]:
def build_housing_model(hp):
    """Build and compile a housing regression model for KerasTuner.

    Three hyperparameters are declared: "is_multi_layered" (whether there is
    a second hidden layer), "units" (the width of that second layer, used
    only when it is present) and "optimizer".

    Parameters
    ----------
    hp : the hyperparameters object that KerasTuner passes in; it is used
        here to declare and sample the hyperparameters listed above.

    Returns
    -------
    The compiled Keras model (MSE loss, MAE metric).
    """
    # A Normalization layer needs each feature's mean and variance before it
    # can standardize, so learn them from the training features.
    # NOTE: this uses all of X_train_housing, including any rows that a
    # later `validation_split` holds out for validation.
    normalizer = Normalization()
    normalizer.adapt(np.asarray(X_train_housing, dtype="float32"))

    inputs = Input(shape=(4,))
    x = normalizer(inputs)
    x = Dense(units=16, activation="relu")(x)
    # Optionally add a second hidden layer whose width is also tuned
    hp_is_multilayered = hp.Boolean("is_multi_layered")
    if hp_is_multilayered:
        x = Dense(hp.Choice("units", [2, 4, 8]), activation="relu")(x)
    outputs = Dense(units=1, activation="linear")(x)
    housing_model = Model(inputs, outputs)
    # The optimizer is chosen by name, so each one runs with its default settings
    housing_model.compile(optimizer=hp.Choice("optimizer", values=["sgd", "rmsprop", "adam", "nadam"]),
                          loss="mse", metrics=["mae"])
    return housing_model

In [None]:
# Random search: sample hyperparameter combinations at random rather than
# trying them all, comparing trials on the validation MAE.
tuner = keras_tuner.RandomSearch(
    build_housing_model,
    objective="val_mae",
    # Upper limit on the number of hyperparameter combinations to try
    max_trials=5,
    # Same directory/project_name as the grid search earlier in the notebook;
    # overwrite=True means that earlier state is replaced, not reloaded
    directory = base_dir,
    project_name="tuner_state",
    overwrite=True)

In [None]:
tuner.search(X_train_housing, y_train_housing, epochs=20, validation_split=0.25)

In [None]:
tuner.get_best_hyperparameters()[0].values

In [None]:
best_housing_model = tuner.get_best_models(num_models=1)[0]

In [None]:
best_housing_model.summary()

In [None]:
history = best_housing_model.fit(X_train_housing, y_train_housing, validation_split=0.25, epochs=40, batch_size=32, verbose=0)

## Multiclass Classification of Handwritten Digits

MNIST is a classic dataset containing images of hand-written digits.

Each example is a 28 pixel by 28 pixel grayscale image. The values are integers in [0, 255]. Each example is labelled to say what digit is contained in the image: 0-9. 

There are 70,000 images, so we can safely use holdout, and it is already partitioned: 60,000 training images; 10,000 test images.

In [None]:
# Keras has a utility function for downloading it into four Numpy arrays
# To get this to work on macOS, you might need to run something like this in a terminal:
# $ /Applications/Python\ 3.12/Install\ Certificates.command
# You may need to replace 3.12 by whatever version of Python you are using, e.g. 3.13
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = mnist.load_data()

In [None]:
X_train_mnist.shape

In [None]:
X_train_mnist.dtype

In [None]:
np.unique(y_train_mnist)

In [None]:
X_test_mnist.shape

In [None]:
np.unique(y_test_mnist)

In [None]:
idx = 126 # Change this number to look at other images
some_example = X_train_mnist[idx]

In [None]:
# Look at the raw data for this image. Warning: large! (28 by 28)
some_example

In [None]:
# Draw it
some_example = some_example.reshape(28, 28)

fig = plt.figure(figsize=(2,2))
plt.imshow(some_example, cmap=plt.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

In [None]:
# Take a look at its class
y_train_mnist[idx]

We need to reshape the data.

Our training data is in a 3D array of shape (60000, 28, 28). We change it to a 2D array of shape (60000, 28 * 28). Similarly, the test data.

This 'flattens' the images. When working with images, it is often better not to do this. In a future lecture, we'll build neural networks that do not require us to flatten.

In [None]:
# Flatten each image into a single vector of 28 * 28 = 784 values.
# The number of examples is taken from the array itself and -1 lets NumPy
# infer the flattened length, so no dataset sizes are hard-coded.
X_train_mnist = X_train_mnist.reshape((X_train_mnist.shape[0], -1))

X_test_mnist = X_test_mnist.reshape((X_test_mnist.shape[0], -1))

We'll do a two-layer network. One hidden layer with 512 neurons, using the ReLU activation function. The output layer will have 10 neurons, one per class, and will use the softmax activation function.

The features (pixel values) are all in the same range [0, 255], so we do not need to standardize using a Normalization layer. But it is a bad idea to feed into a neural network values that are much larger than the initial weights, so we will rescale to [0, 1] by dividing by 255. We can do this using a Rescaling layer.

In [None]:
inputs = Input(shape=(28 * 28,))
x = Rescaling(scale=1./255)(inputs)
x = Dense(units=512, activation="relu")(x)
outputs = Dense(units=10, activation="softmax")(x)
mnist_model = Model(inputs, outputs)

In [None]:
mnist_model.compile(optimizer=RMSprop(learning_rate=0.0001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
mnist_model.summary()

Do you understand all the numbers in the table above?

In [None]:
mnist_model.fit(X_train_mnist, y_train_mnist, epochs=10, batch_size=32)

In [None]:
test_loss, test_acc = mnist_model.evaluate(X_test_mnist, y_test_mnist)
test_acc

Having learned the weights, we should save the model (network, weights, training configuration, state of the optimizer) so that we don't have to learn them again!

In [None]:
# Make sure the target folder exists before saving; the save may otherwise
# fail on a fresh checkout where "models" has not been created yet.
model_path = os.path.join(base_dir, "models/my_model.keras")
os.makedirs(os.path.dirname(model_path), exist_ok=True)
mnist_model.save(model_path)

To reinstantiate a model, including compiling the model using the saved training configuration:

In [None]:
reloaded_model = load_model(os.path.join(base_dir, "models/my_model.keras"))