## Install dependencies

In [1]:
!pip install -q keras-core

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/944.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/944.9 kB[0m [31m978.8 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.4/944.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m880.6/944.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m944.9/944.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Set the backend

In this tutorial we will use Tensorflow as our backend, you can change it to "jax" or "torch" for JAX and PyTorch separately.

In [2]:
import os

# Try oicking a different value from "tensoroflow", "jax" and "torch".
os.environ["KERAS_BACKEND"] = "tensorflow"

In [3]:
import tensorflow as tf
import keras_core as keras

import numpy as np

Using TensorFlow backend


## Use a Keras layer

In [4]:
layer = keras.layers.Dense(
    2,
    activation="relu",
    bias_initializer="zeros",
)
data = np.random.uniform(size=[3, 3])
layer(data)

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[0.        , 0.61430275],
       [0.        , 0.3775722 ],
       [0.        , 0.2922499 ]], dtype=float32)>

## Write your custom layer

In [5]:
class Linear(keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(
            shape=[input_dim, self.units], initializer="glorot_uniform",
        )
        self.b = self.add_weight(
            shape=[self.units,], initializer="zeros"
        )

    def call(self, inputs):
        return keras.ops.matmul(inputs, self.w) + self.b

layer = Linear(2)
layer(data)

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.7858173 , -0.73075306],
       [-0.3036967 , -0.6223322 ],
       [-0.07196832, -0.72267205]], dtype=float32)>

# Different ways of building the model

- Sequential API
- [**Preferred**] Functional API
- Subclassing model

In [6]:
# Sequential model
sequential_model = keras.Sequential([
    keras.Input([28, 28, 3]),
    keras.layers.Conv2D(8, 2),
    keras.layers.MaxPool2D(2),
    keras.layers.Flatten(),
    keras.layers.Dense(2),
    keras.layers.Softmax(),
])

print(sequential_model.summary())

None


In [7]:
# Functional model
inputs = keras.Input([28, 28, 3])
x = keras.layers.Conv2D(8, 2)(inputs)
x = keras.layers.MaxPool2D(2)(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(2)(x)
outputs = keras.layers.Softmax()(x)

functional_model = keras.Model(inputs=inputs, outputs=outputs)
print(functional_model.summary())

None


Let's build a complex model with functional api.

In [None]:
# Multi-input model.
input_1 = keras.Input([28, 28, 3])
input_2 = keras.Input([28, 28, 3])
x_1 = keras.layers.Conv2D(8, 2)(input_1)
x_2 = keras.layers.Conv2D(8, 2)(input_2)
x = keras.layers.MaxPool2D(2)(x_1 + x_2)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(2)(x)
outputs = keras.layers.Softmax()(x)

multi_input_functional_model = keras.Model(inputs=inputs, outputs=outputs)
print(multi_input_functional_model.summary())

None


In [8]:
# Subclassing `keras.Model`

class MyModel(keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._conv = keras.layers.Conv2D(8, 2)
        self._max_pool = keras.layers.MaxPool2D(2)
        self._flatten = keras.layers.Flatten()
        self._dense = keras.layers.Dense(2)
        self._softmax = keras.layers.Softmax()

    def call(self, inputs):
        x = self._conv(inputs)
        x = self._max_pool(x)
        x = self._flatten(x)
        x = self._dense(x)
        return self._softmax(x)

subclass_model = MyModel()
subclass_model(np.random.uniform(size=[1, 28, 28, 3]))
subclass_model.summary()

# Models are callable.

`keras.Model` instances are also callable. You can call models as if it is a function, no matter which way you used to build the model.

In [None]:
sample_data = np.random.uniform(size=[1, 28, 28, 3])

print(sequential_model(sample_data))
print(functional_model(sample_data))
print(subclass_model(sample_data))

[[0.72899467 0.27100533]]
[[0.18588641 0.8141136 ]]
[[0.8943892 0.1056108]]


## Models can be sliced

In [None]:
sliced_model = keras.Model(
    inputs=functional_model.inputs,
    outputs=functional_model.get_layer("conv2d_1").output,
)
sliced_model.summary()

# Train the model

Now we have defined the model. For actual training, we need the following pieces:
- Data
- Loss/Target function
- Optimizer

In [9]:
# Prepare data for a 2-class classification.
data = np.random.uniform(size=[20, 28, 28, 3])
label = np.random.randint(2, size=20)

In [10]:
# Let's use functional model defined above.
model = functional_model

In [11]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(0.001),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [12]:
model.fit(
    data,
    label,
    batch_size=5,
    epochs=2,
)

Epoch 1/2
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 0.6664 - sparse_categorical_accuracy: 0.6133
Epoch 2/2
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6117 - sparse_categorical_accuracy: 0.7800 


<keras_core.src.callbacks.history.History at 0x7c0f499438b0>

# [Reading Material] Eager mode and graph mode (demo with TF syntax)

Debug in eager mode, put actual job in graph mode. Read more [here](https://www.tensorflow.org/guide/basics#graphs_and_tffunction).

In [None]:
# Eager mode
print(model(data[0:1, ...]))

# Graph mode
print(model.predict(data[0:1, ...]))

tf.Tensor([[0.03376885 0.9662311 ]], shape=(1, 2), dtype=float32)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360ms/step
[[0.03376885 0.9662311 ]]


In [None]:
@tf.function(jit_compile=True)
def forward_pass(x):
    outputs = model(x)
    print(outputs)
    return outputs

print(forward_pass(data[0:1, ...]))

Tensor("functional_2_1/softmax_1_1/Softmax:0", shape=(1, 2), dtype=float32)
tf.Tensor([[0.03376885 0.9662311 ]], shape=(1, 2), dtype=float32)


In [None]:
# You can also run `fit`, `predict` in eager mode.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(0.001),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    run_eagerly=True
)

# Run `predict` in eager mode by setting `run_eagerly` above.
print(model.predict(data[0:1, ...]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[[0.03376885 0.9662311 ]]


# [Reading Material] Write your custom training loop (demo with TF syntax)

We encourage using `fit` for training/finetuning as much as possible, but it's still important to write you custom training loop. For example, if you want to debug the training loop, or have a better control over each step, you can use custom training loop.

In [13]:
train_data = tf.data.Dataset.from_tensor_slices((data, label))
train_data = train_data.batch(5).cache().prefetch(tf.data.AUTOTUNE)

In [14]:
loss_fn = keras.losses.SparseCategoricalCrossentropy()
optimizer = keras.optimizers.Adam(0.001)
metric = keras.metrics.SparseCategoricalAccuracy()

In [15]:
@tf.function
def train_step(data):
    x, y = data
    with tf.GradientTape() as tape:
        outputs = model(x)
        loss = loss_fn(y, outputs)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    metric.update_state(y, outputs)

    return loss


for i, data in enumerate(train_data):
    loss = train_step(data)
    print(f"Batch {i}, loss: {loss:.3f}, accuracy: {metric.result():.3f}")

Batch 0, loss: 0.593, accuracy: 0.600
Batch 1, loss: 0.633, accuracy: 0.600
Batch 2, loss: 0.884, accuracy: 0.467
Batch 3, loss: 0.686, accuracy: 0.500


# [Reading Material] Save your model

In [None]:
# Save the weights.
model.save_weights("dummy.weights.h5")

In [None]:
sequential_model.load_weights("dummy.weights.h5")

In [None]:
tf.reduce_sum(sequential_model.get_layer("conv2d").kernel - model.get_layer("conv2d_1").kernel)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

# Instruction Finetuning a Generative Model

We will show how to finetune a GPT2 model with Dolly dataset. We use GPT2 for demonstration purpose, users can switch it to other KerasNLP models.

In [16]:
!pip install -q keras-nlp datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m573.5/573.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
import keras_nlp

gpt2_causal_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en")
gpt2_causal_lm.generate("I think basketball is fun because", max_length=50)

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/vocab.json
[1m1042301/1042301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/merges.txt
[1m456318/456318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step       
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/model.h5
[1m497986112/497986112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 0us/step


'I think basketball is fun because it has a lot of great players. The game can have a lot of great players because of the variety of different things you can do to get a shot at them.\n\nThe game is fun because it has'

In [18]:
# Load Dolly2 dataset from HuggingFace
from datasets import load_dataset

dolly = load_dataset("databricks/databricks-dolly-15k", split="train")

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Format data with a prompt template.

In [19]:
prompt_template = {
    "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{response}",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}",
}

In [20]:
def format_text(data):
    if len(data["context"]) == 0:
        full_text = prompt_template["prompt_no_input"].format(
            instruction=data["instruction"],
            input=data["context"],
            response=data["response"],
        )
    else:
        full_text = prompt_template["prompt_input"].format(
            instruction=data["instruction"],
            input=data["context"],
            response=data["response"],
        )

    return full_text

In [21]:
dolly_text = []
for data in dolly:
    dolly_text.append(format_text(data))

In [22]:
dolly_tf_data = tf.data.Dataset.from_tensor_slices(dolly_text)
dolly_tf_data = dolly_tf_data.batch(4).cache().prefetch(tf.data.AUTOTUNE)

Take a look at the dataset record.

In [31]:
sample = next(iter(dolly_tf_data))
print(sample)

tf.Tensor(
[b"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhen did Virgin Australia start operating?\n\n### Input:\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.\n\n### Response:\nVirgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route."
 b'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n

Take a preview of what the processed training data looks like.

In [32]:
gpt2_causal_lm.preprocessor(sample)

({'token_ids': <tf.Tensor: shape=(4, 1024), dtype=int32, numpy=
  array([[50256, 21106,   318, ...,     0,     0,     0],
         [50256, 21106,   318, ...,     0,     0,     0],
         [50256, 21106,   318, ...,     0,     0,     0],
         [50256, 21106,   318, ...,     0,     0,     0]], dtype=int32)>,
  'padding_mask': <tf.Tensor: shape=(4, 1024), dtype=bool, numpy=
  array([[ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False]])>},
 <tf.Tensor: shape=(4, 1024), dtype=int32, numpy=
 array([[21106,   318,   281, ...,     0,     0,     0],
        [21106,   318,   281, ...,     0,     0,     0],
        [21106,   318,   281, ...,     0,     0,     0],
        [21106,   318,   281, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(4, 1024), dtype=bool, numpy=
 array([[ True,  True,  True, ..., False, F

In [34]:
num_epochs = 2
lr = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=dolly_tf_data.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
gpt2_causal_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.AdamW(lr, weight_decay=0.001),
    metrics=[keras.metrics.SparseCategoricalAccuracy(),]
)

# Use MLflow to track your experiments

MLflow is a handy tool for managing the lifecycle of your ML experiments. Here we show how to use it for experiment tracking purposes.

If you have not, please register an account of [Databricks community edition](https://www.databricks.com/try-databricks#account). It should take no longer than 1min to register. Databricks CE (community edition) is a free platform for users to try out Databricks features. For this guide, we need the ML experiment dashboard for us to track our training progress.

In [24]:
!pip install -q git+https://github.com/mlflow/mlflow.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.5/189.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

After you have sucessfully registered an account, all you need to do is to run the command below to connect from Google Colab to your Databricks account. You will need to enter following information at prompt:
- **Databricks Host**: https://community.cloud.databricks.com/
- **Username**: your signed up email
- **Password**: your password

In [27]:
!databricks configure

Databricks Host (should begin with https://): https://community.cloud.databricks.com/
Username: qianchen94era@gmail.com
Password: 
Repeat for confirmation: 


In [35]:
import mlflow

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/2023-ghc-keras-demo")

with mlflow.start_run() as run:
    mlflow_callback = mlflow.keras_core.MLflowCallback(
        run,
        log_every_epoch=False,
        log_every_n_steps=20,
    )
    gpt2_causal_lm.fit(
        dolly_tf_data.take(100),
        epochs=num_epochs,
        callbacks=[mlflow_callback],
    )

Epoch 1/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 1s/step - loss: 0.5456 - sparse_categorical_accuracy: 0.0932
Epoch 2/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 1s/step - loss: 0.4396 - sparse_categorical_accuracy: 0.1072


In [None]:
gpt2_causal_lm.generate("I think basketball is fun because", max_length=50)

"I think basketball is fun because it's fun, but it's also hard to learn from. You can't really learn how to play basketball from the outside. You have to learn how to play the game. You have to learn how to play"