In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI - Gemma distributed tuning with LoRA on TPUv5e, serving on L4 GPU

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/tpuv5e_gemma_peft_finetuning_and_serving.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"><br> Run in Colab (Will require the higher memory Colab pro)
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Ftraining%2Ftpuv5e_gemma_peft_finetuning_and_serving.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/training/tpuv5e_gemma_peft_finetuning_and_serving.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br>
Open in Vertex AI Workbench
    </a> (An e2-standard-8 CPU w/ 250GB disk is recommended)
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/tpuv5e_gemma_peft_finetuning_and_serving.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br>
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook is based on the [LoRA tuning example on ai.google.dev](https://ai.google.dev/gemma/docs/distributed_tuning). It follows an existing [Model Garden example written for fine-tuning on GPUs](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_finetuning_on_vertex.ipynb), and has been modified to use the latest TPUv5e chips for training. It demonstrates fine-tuning and deploying Gemma models with [Vertex AI Custom Training Job](https://cloud.google.com/vertex-ai/docs/training/create-custom-job). A Vertex AI Custom Training Job allows for a higher level of customization and control over the fine-tuning job. All of the examples in this notebook use parameter efficient fine-tuning methods [PEFT](https://github.com/huggingface/peft) to reduce training and storage costs.

This notebook deploys the model with a [vLLM](https://github.com/vllm-project/vllm) docker


### Objective

- Fine-tune and deploy Gemma models with a Vertex AI Custom Training Job.
- Send prediction requests to your fine-tuned Gemma model.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Dataset

In this example, the IMDB reviews dataset from TensorFlow datasets is used to finetune the model. Details of the dataset can be found here: https://www.tensorflow.org/datasets/catalog/imdb_reviews

### Costs 

This tutorial uses billable components of Google Cloud:

Vertex AI (Training, TPUv5e, L4 GPU), Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [Cloud NL API pricing](https://cloud.google.com/natural-language/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [2]:
import os

# (optional) update gcloud if needed
if os.getenv("IS_TESTING"):
    ! gcloud components update --quiet

! pip3 install --upgrade --quiet google-cloud-aiplatform

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [3]:
import sys

if "google.colab" in sys.modules:

    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [4]:
import sys

if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Kaggle credentials
Gemma models are hosted by Kaggle. To use Gemma, request access on Kaggle:

* Sign in or register at [kaggle.com](https://www.kaggle.com)
* Open the [Gemma model card](https://www.kaggle.com/models/google/gemma) and select "Request Access"
* Complete the consent form and accept the terms and conditions

Then, to use the Kaggle API, create an API token:

* Open [Kaggle settings](https://www.kaggle.com/settings)
* Select "Create New Token"
* A kaggle.json file is downloaded. It contains your Kaggle credentials. Take note of the username and key since you'll use these later.

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

#### Location

You can also change the `LOCATION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

TPUv5e is available in the [following regions listed here](https://cloud.google.com/tpu/pricing)

In [5]:
LOCATION = "us-central1"  # @param {type: "string"}

In [6]:
PROJECT_ID = "ml-project-461521"  # @param {type:"string"}

### Import libraries

In [7]:
import os
from datetime import datetime, timedelta

from google.cloud import aiplatform

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [8]:
BUCKET_URI = f"gs://tpu-training-gemma-7b-{PROJECT_ID}-unique"  # @param {type:"string"}

#### Set folder paths for staging, environment, and model artifacts

In [9]:
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "model")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = "190345877179-compute@developer.gserviceaccount.com"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [22]:
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://tpu-training-gemma-7b-ml-project-461521-unique/...


### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [10]:
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=STAGING_BUCKET)

### Select the Gemma base model

In [11]:
# The Gemma base model.
base_model = "google/gemma-7b-it"  # @param ["google/gemma-2b", "google/gemma-2b-it", "google/gemma-7b", "google/gemma-7b-it"]

### Create the artifact registry repository and set the custom docker image uri

In [12]:
REPOSITORY = "tpuv5e-training-repository-unique-7b-it"

In [13]:
image_name_train = "gemma-lora-tuning-tpuv5e"
hostname = f"{LOCATION}-docker.pkg.dev"
tag = "latest"

In [27]:
# Register gcloud as a Docker credential helper
!gcloud auth configure-docker $LOCATION-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-west1-docker.pkg.dev": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


In [17]:
# One time or use an existing repository
!gcloud artifacts repositories create $REPOSITORY --repository-format=docker \
--location=$LOCATION --description="Vertex TPUv5e training repository"

[1;31mERROR:[0m (gcloud.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists


In [18]:
# Define container image name
KERAS_TRAIN_DOCKER_URI = (
    f"{hostname}/{PROJECT_ID}/{REPOSITORY}/{image_name_train}:{tag}"
)

# Set the docker image uri for the vLLM serving container
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240220_0936_RC01"

# Set the docker image uri for the model conversion container that converts the fine-tuned model to HF format
KERAS_MODEL_CONVERSION_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/jax-keras-model-conversion:20240220_0936_RC01"

### Define common functions

In [19]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Gets the job name with date time when triggering training or deployment
    jobs in Vertex AI.
    """
    return prefix + datetime.now().strftime("_%Y%m%d_%H%M%S")


def deploy_model_vllm(
    model_name: str,
    model_uri: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_model_len: int = 8192,
    dtype: str = "bfloat16",
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    # Upload the model to "Model Registry"
    job_name = get_job_name_with_datetime(model_name)
    vllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.95",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        "--disable-log-stats",
    ]
    model = aiplatform.Model.upload(
        display_name=job_name,
        artifact_uri=model_uri,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=vllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
    )

    # Deploy the model to an endpoint to serve "Online predictions"
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        sync=False,
    )

    return model, endpoint

### Build the Docker container files

#### Create the trainer directory

In [20]:
import os

if not os.path.exists("trainer"):
    os.makedirs("trainer")

#### Kaggle credentials are required for KerasNLP training and Hex-LLM deployment with TPUs.
Set the KAGGLE_USERNAME AND KAGGLE_KEY to pass in as an environment variable for Vertex Training to use
Fenerate the Kaggle username and key by following [these instructions](https://github.com/Kaggle/kaggle-api?tab=readme-ov-file#api-credentials).
As mentioned earlier, you need to review and accept the model license.

In [21]:
KAGGLE_USERNAME = "denaumenko1"  # @param {type:"string", isTemplate:true}
KAGGLE_KEY = "7616f231472466443ca6a69f6a5a9012"  # @param {type:"string", isTemplate:true}

#### Create the Dockerfile for the custom container. This will install JAX[TPU], Keras, and TensorFlow datasets

In [22]:
%%writefile trainer/Dockerfile
# This Dockerfile fine tunes the Gemma model using LoRA with JAX

FROM python:3.10

ENV DEBIAN_FRONTEND=noninteractive

# Install basic libs
RUN apt-get update && apt-get -y upgrade && apt-get install -y --no-install-recommends \
        cmake \
        curl \
        wget \
        sudo \
        gnupg \
        libsm6 \
        libxext6 \
        libxrender-dev \
        lsb-release \
        ca-certificates \
        build-essential \
        git \
        libgl1

# Copy Apache license.
RUN wget https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/LICENSE

# Install required libs
RUN pip install --upgrade pip
RUN pip install jax[tpu]==0.4.25 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
RUN pip install tensorflow==2.15.0.post1
RUN pip install tensorflow-datasets==4.9.4
RUN pip install -q -U keras-nlp==0.8.2
RUN pip install keras==3.0.5

# Copy other licenses.
RUN wget -O MIT_LICENSE https://github.com/pytest-dev/pytest/blob/main/LICENSE
RUN wget -O BSD_LICENSE https://github.com/pytorch/xla/blob/master/LICENSE
RUN wget -O BSD-3_LICENSE https://github.com/pytorch/pytorch/blob/main/LICENSE

ENV KERAS_BACKEND=jax
ENV XLA_PYTHON_CLIENT_MEM_FRACTION=0.9
ENV TPU_LIBRARY_PATH=/lib/libtpu.so

# Copy install libtpu to PATH above
RUN find ./usr/local/lib -name 'libtpu.so' -exec cp {} /lib \;

WORKDIR /
COPY train.py train.py
ENV PYTHONPATH ./

ENTRYPOINT ["python", "train.py"]

Overwriting trainer/Dockerfile


#### Add the __init__.py file

In [23]:
!touch trainer/__init__.py

#### Add the train.py file
This code is from the LoRA distributed fine-tuning code from this example: https://ai.google.dev/gemma/docs/distributed_tuning

The IMDB TensorFlow dataset is used to fine-tune the Gemma model. Additional logic is added to handle the TPU topology setting required by TPUv5e


In [24]:
%%writefile trainer/train.py
import os
import argparse
import shutil
import locale

# Переменные для сохранения модели и токенизатора
_ENCODING_FOR_MODEL_SAVING = "UTF-8"
_VOCABULARY_FILENAME = "vocabulary.spm"
_TOKENIZER_FILENAME = "tokenizer.model"

# Импорт библиотек Keras, TensorFlow, KerasNLP и TensorFlow Datasets
import keras
import keras_nlp
import tensorflow
import tensorflow_datasets as tfds
print (keras.__version__)
print (tensorflow.__version__)

# Параметры, передаваемые в скрипт
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tpu_topology",
    help="Топология TPUv5e (например, 1x1, 2x2 и т.д.)",
    type=str
)
parser.add_argument(
    "--model_name",
    help="Имя модели Gemma (например, gemma_2b_en)",
    type=str
)
parser.add_argument(
    "--output_folder",
    type=str,
    required=True,
    help="Путь к директории, куда сохранить чекпоинт и токенизатор",
)
parser.add_argument(
    "--checkpoint_filename",
    type=str,
    default="fine_tuned.weights.h5",
    help="Имя файла для сохранения чекпоинта",
)
args = parser.parse_args()

def main():
    # Парсинг топологии TPU (например, 2x2)
    x = args.tpu_topology.split("x")
    tpu_topology_x = int(x[0])
    tpu_topology_y = int(x[1])
    print (f'TPU topology is ({tpu_topology_x}, {tpu_topology_y})')
    print (f'Model name is {args.model_name}')

    # Создание сетки устройств (mesh) для распределённого обучения
    device_mesh = keras.distribution.DeviceMesh(
        (tpu_topology_x, tpu_topology_y),
        ["batch", "model"],
        devices=keras.distribution.list_devices())

    model_dim = "model"

    # Настройка шардинга весов модели
    layout_map = keras.distribution.LayoutMap(device_mesh)
    layout_map["token_embedding/embeddings"] = (None, model_dim)
    layout_map["decoder_block.*attention.*(query|key|value).*kernel"] = (
        None, model_dim, None)
    layout_map["decoder_block.*attention_output.*kernel"] = (
        None, None, model_dim)
    layout_map["decoder_block.*ffw_gating.*kernel"] = (model_dim, None)
    layout_map["decoder_block.*ffw_linear.*kernel"] = (None, model_dim)

    # Применение стратегии параллелизма модели
    model_parallel = keras.distribution.ModelParallel(
        device_mesh, layout_map, batch_dim_name="batch")
    keras.distribution.set_distribution(model_parallel)

    # Загрузка модели Gemma по имени
    model_name = args.model_name
    gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(args.model_name)

    # Инференс до fine-tuning
    print (f'Running inference on the base {args.model_name} model')
    lm_output = gemma_lm.generate("Prompt: Return 3 things I ask for in this format. \
        Response: 1) item 1 2) item 2 3) item 3. \
        Prompt: List the 3 best comedy movies in the 90s Response: ", max_length=100)
    print (lm_output)

    # Загрузка датасета IMDB
    imdb_train = tfds.load(
        "imdb_reviews",
        split="train",
        as_supervised=True,
        batch_size=2,
    )
    # Удаление меток (оставляем только текст)
    imdb_train = imdb_train.map(lambda x, y: x)

    # Просмотр одного примера
    imdb_train.unbatch().take(1).get_single_element().numpy()

    # Включение LoRA в backbone модели
    gemma_lm.backbone.enable_lora(rank=4)

    # Ограничение длины последовательности
    gemma_lm.preprocessor.sequence_length = 128

    # Настройка оптимизатора AdamW
    optimizer = keras.optimizers.AdamW(
        learning_rate=5e-5,
        weight_decay=0.01,
    )
    # Исключение из weight decay: bias и scale
    optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

    # Компиляция модели с функцией потерь и метриками
    gemma_lm.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    gemma_lm.summary()

    # Обучение модели на 1 эпоху
    gemma_lm.fit(imdb_train, epochs=1)

    # Инференс после fine-tuning
    print (f'Running inference on the fine-tuned {args.model_name} model')
    lm_output = gemma_lm.generate("Prompt: Return 3 things I ask for in this format. \
        Response: 1) item 1 2) item 2 3) item 3. \
        Prompt: List the 3 best comedy movies in the 90s Response: ", max_length=100)
    print (lm_output) 

    # Сохранение чекпоинта и токенизатора
    print("Saving checkpoint and tokenizer.")
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    locale.getpreferredencoding = lambda: _ENCODING_FOR_MODEL_SAVING
    gemma_lm.save_weights(
        os.path.join(args.output_folder, args.checkpoint_filename)
    )
    gemma_lm.preprocessor.tokenizer.save_assets(args.output_folder)

    # Копирование и переименование файла токенизатора
    print("Copying tokenizer file.")
    shutil.copy(
        os.path.join(args.output_folder, _VOCABULARY_FILENAME),
        os.path.join(args.output_folder, _TOKENIZER_FILENAME),
    )
    print ('Exiting job')

# Точка входа в скрипт
if __name__ == "__main__":
    main()

Overwriting trainer/train.py


## Fine-tune with Vertex AI Custom Training Jobs

This section demonstrates how to fine-tune and deploy Gemma models with PEFT LoRA on Vertex AI Custom Training Jobs. LoRA (Low-Rank Adaptation) is one approach of PEFT (Parameter Efficient Fine-tuning), where pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during fine-tuning. Read more about LoRA in the following publication: [Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. Lora: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).

#### Enable docker to run as a regular user

In [36]:
!sudo usermod -a -G docker ${USER}

#### Change to the trainer directory to build the docker container

In [37]:
%cd trainer

/home/jupyter/vertex-ai-samples/notebooks/official/training/trainer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


#### Build the custom docker container and push to artifact registry

In [38]:
print(KERAS_TRAIN_DOCKER_URI)

us-central1-docker.pkg.dev/ml-project-461521/tpuv5e-training-repository-unique-7b-it/gemma-lora-tuning-tpuv5e:latest


In [47]:
!docker build -t $KERAS_TRAIN_DOCKER_URI -f Dockerfile .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 1.48kB                                     0.0s
[0m => [internal] load metadata for docker.io/library/python:3.10             0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 1.48kB                                     0.0s
[0m => [internal] load metadata for docker.io/library/python:3.10             0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.4s (2/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile     

In [39]:
!docker push $KERAS_TRAIN_DOCKER_URI

The push refers to repository [us-central1-docker.pkg.dev/ml-project-461521/tpuv5e-training-repository-unique-7b-it/gemma-lora-tuning-tpuv5e]

[1B8f67f029: Preparing 
[1B37694c0f: Preparing 
[1Ba2369350: Preparing 
[1Bb63ba4ac: Preparing 
[1B73d2e4ea: Preparing 
[1B2407f1d4: Preparing 
[1Bf342cfa2: Preparing 
[1Bc9a2c33e: Preparing 
[1B6a54940c: Preparing 
[1Be1bf17a1: Preparing 
[1B6a682e1e: Preparing 
[1Bcb67400a: Preparing 
[1Bdd27167e: Preparing 
[1B408139bb: Preparing 
[1B51729d89: Preparing 
[1B95464665: Preparing 
[1B3e410343: Preparing 
[1B05ffff1c: Preparing 
[1Bd283e64a: Preparing 
[2Bd283e64a: Mounted from ml-project-461521/tpuv5e-training-repository-unique/gemma-lora-tuning-tpuv5e [17A[2K[20A[2K[19A[2K[14A[2K[15A[2K[13A[2K[10A[2K[12A[2K[11A[2K[9A[2K[8A[2K[7A[2K[5A[2K[4A[2K[1A[2K[2A[2Klatest: digest: sha256:1cb83be733f6c83b892f6559fef5b5156f3d359caf5faae948cc028e40c84f55 size: 4540


#### Change back to your home directory

In [40]:
%cd ..

/home/jupyter/vertex-ai-samples/notebooks/official/training


#### Set GCS folder locations and job configurations settings

In [37]:
# Create a GCS folder to store the merged model with the base model and the
# fine-tuned LORA adapter.
merged_model_dir = get_job_name_with_datetime("gemma-lora-model-tpuv5")
merged_model_output_dir = os.path.join(MODEL_BUCKET, merged_model_dir)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Set the checkpoint output filename
checkpoint_filename = "fine_tuned.weights.h5"

DISPLAY_NAME_PREFIX = "gemma-lora-train"  # @param {type:"string"}
tpuv5e_gemma_peft_job = {
    "display_name": get_job_name_with_datetime(DISPLAY_NAME_PREFIX),
    "job_spec": {
        "worker_pool_specs": [
            {
                "machine_spec": {
                    "machine_type": "ct5lp-hightpu-4t",
                    "tpu_topology": "2x4",
                },
                "replica_count": 1,
                "container_spec": {
                    "image_uri": KERAS_TRAIN_DOCKER_URI,
                    "args": [
                        "--tpu_topology=2x4",
                        "--model_name=gemma_instruct_7b_en",
                        f"--output_folder={merged_model_output_dir_gcsfuse}",
                        f"--checkpoint_filename={checkpoint_filename}",
                    ],
                    "env": [
                        {"name": "KAGGLE_USERNAME", "value": KAGGLE_USERNAME},
                        {"name": "KAGGLE_KEY", "value": KAGGLE_KEY},
                    ],
                },
            },
        ],
    },
}

tpuv5e_gemma_peft_job

{'display_name': 'gemma-lora-train_20250718_203547',
 'job_spec': {'worker_pool_specs': [{'machine_spec': {'machine_type': 'ct5lp-hightpu-4t',
     'tpu_topology': '2x4'},
    'replica_count': 1,
    'container_spec': {'image_uri': 'us-central1-docker.pkg.dev/ml-project-461521/tpuv5e-training-repository-unique-7b-it/gemma-lora-tuning-tpuv5e:latest',
     'args': ['--tpu_topology=2x4',
      '--model_name=gemma_instruct_7b_en',
      '--output_folder=/gcs/tpu-training-gemma-7b-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250718_203547',
      '--checkpoint_filename=fine_tuned.weights.h5'],
     'env': [{'name': 'KAGGLE_USERNAME', 'value': 'denaumenko1'},
      {'name': 'KAGGLE_KEY', 'value': '7616f231472466443ca6a69f6a5a9012'}]}}]}}

#### Create job client and run job

In [38]:
job_client = aiplatform.gapic.JobServiceClient(
    client_options=dict(api_endpoint=f"{LOCATION}-aiplatform.googleapis.com")
)

In [41]:
create_tpuv5e_gemma_peft_job_response = job_client.create_custom_job(
    parent="projects/{project}/locations/{location}".format(
        project=PROJECT_ID, location=LOCATION
    ),
    custom_job=tpuv5e_gemma_peft_job,
)
print(create_tpuv5e_gemma_peft_job_response)

ResourceExhausted: 429 The following quota metrics exceed quota limits: aiplatform.googleapis.com/custom_model_training_tpu_v5e

#### Check on job progress
This may take 20-60 minutes or more depending on the model size. Run this cell multiple times to check progress

In [28]:
get_tpuv5e_gemma_peft_job_response = job_client.get_custom_job(
    name=create_tpuv5e_gemma_peft_job_response.name
)
get_tpuv5e_gemma_peft_job_response

name: "projects/190345877179/locations/us-central1/customJobs/8142851809568882688"
display_name: "gemma-lora-train_20250718_202105"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "ct5lp-hightpu-4t"
      tpu_topology: "2x2"
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    container_spec {
      image_uri: "us-central1-docker.pkg.dev/ml-project-461521/tpuv5e-training-repository-unique-7b-it/gemma-lora-tuning-tpuv5e:latest"
      args: "--tpu_topology=2x2"
      args: "--model_name=gemma_instruct_7b_en"
      args: "--output_folder=/gcs/tpu-training-gemma-7b-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250718_202105"
      args: "--checkpoint_filename=fine_tuned.weights.h5"
      env {
        name: "KAGGLE_USERNAME"
        value: "denaumenko1"
      }
      env {
        name: "KAGGLE_KEY"
        value: "7616f231472466443ca6a69f6a5a9012"
      }
    }
  }
}
state: JOB_STATE_PENDING
crea

#### Click on the console log url output from this cell to see your logs

In [29]:
from keras_nlp.models import GemmaCausalLM
print(GemmaCausalLM.presets.keys())

2025-07-18 20:21:22.603996: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752870082.835189    7109 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752870082.903301    7109 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752870083.454677    7109 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752870083.454726    7109 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752870083.454729    7109 computation_placer.cc:177] computation placer alr

dict_keys(['gemma_2b_en', 'gemma_instruct_2b_en', 'gemma_1.1_instruct_2b_en', 'code_gemma_1.1_2b_en', 'code_gemma_2b_en', 'gemma_7b_en', 'gemma_instruct_7b_en', 'gemma_1.1_instruct_7b_en', 'code_gemma_7b_en', 'code_gemma_instruct_7b_en', 'code_gemma_1.1_instruct_7b_en', 'gemma2_2b_en', 'gemma2_instruct_2b_en', 'gemma2_9b_en', 'gemma2_instruct_9b_en', 'gemma2_27b_en', 'gemma2_instruct_27b_en', 'shieldgemma_2b_en', 'shieldgemma_9b_en', 'shieldgemma_27b_en'])


In [30]:
job_id = create_tpuv5e_gemma_peft_job_response.name[
    create_tpuv5e_gemma_peft_job_response.name.rfind("/") + 1 :
]
startdate = datetime.today() - timedelta(days=1)
startdate = startdate.strftime("%Y-%m-%d")
print(
    f"https://console.cloud.google.com/logs/query;query=resource.labels.job_id=%22{job_id}%22%20timestamp%3E={startdate}"
)

https://console.cloud.google.com/logs/query;query=resource.labels.job_id=%228142851809568882688%22%20timestamp%3E=2025-07-17


### Convert the fine-tuned Keras checkpoint to HF format

#### Download the conversion script from KerasNLP tools
The GitHub repo is https://github.com/keras-team/keras-nlp

In [52]:
!wget -nv -nc https://raw.githubusercontent.com/keras-team/keras-nlp/master/tools/gemma/export_gemma_to_hf.py

#### Download the fine-tuned checkpoint files locally

In [54]:
print(merged_model_output_dir)

gs://tpu-training-gemma-7b-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250718_154802


In [53]:
!gcloud storage cp -r $merged_model_output_dir .

Copying gs://tpu-training-gemma-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250717_212334/fine_tuned.weights.h5 to file://./gemma-lora-model-tpuv5_20250717_212334/fine_tuned.weights.h5
Copying gs://tpu-training-gemma-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250717_212334/tokenizer.model to file://./gemma-lora-model-tpuv5_20250717_212334/tokenizer.model
Copying gs://tpu-training-gemma-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250717_212334/vocabulary.spm to file://./gemma-lora-model-tpuv5_20250717_212334/vocabulary.spm
  Completed files 3/3 | 9.4GiB/9.4GiB | 195.9MiB/s                             

Average throughput: 246.7MiB/s


#### Install libraries for model conversion

In [13]:
!pip install torch==2.1
!pip install --upgrade keras-nlp
!pip install --upgrade keras>=3
!pip install --upgrade accelerate sentencepiece transformers
!pip install numpy<2 --force-reinstall

Collecting torch==2.1
  Using cached torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting sympy (from torch==2.1)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.1)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu1

In [14]:
!pip install "numpy<2.0.0" --force-reinstall

Collecting numpy<2.0.0
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
  You can safely remove it manually.[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.14.1 requires torch==1.13.1, but you have torch 2.1.0 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


#### Run the model conversion script

In [18]:
!pip list | grep tensorflow
!nvcc --version
!ls -l /usr/local/ | grep cuda
!cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2

tensorflow                               2.19.0
tensorflow-io-gcs-filesystem             0.37.1
tensorflow-text                          2.19.0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
lrwxrwxrwx  1 root root   21 Jul  9 04:37 cuda -> /usr/local/cuda-11.8/
drwxr-xr-x 17 root root 4096 Jul  9 04:39 cuda-11.8
#define CUDNN_MAJOR 8
#define CUDNN_MINOR 9
#define CUDNN_PATCHLEVEL 0
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

/* cannot use constexpr here since this is a C-only file */


In [19]:
!sudo rm -rf /usr/local/cuda-11.8
!sudo rm -rf /usr/local/cuda


In [21]:
!wget https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run
!chmod +x cuda_12.2.0_535.54.03_linux.run
!sudo sh cuda_12.2.0_535.54.03_linux.run

--2025-07-18 13:01:26--  https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.45.46.200, 23.45.46.203
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.45.46.200|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4315928767 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.2.0_535.54.03_linux.run’


2025-07-18 13:03:43 (30.2 MB/s) - ‘cuda_12.2.0_535.54.03_linux.run’ saved [4315928767/4315928767]

[?1l>Signal caught, cleaning upqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq[23;3HDo you accept the above EULA? (accept/decline/quit):[2;4H[Hlqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqk[2;1Hx[78Cx[3;1Hx[78Cx[4;1Hx[78Cx[5;1Hx[78Cx[6;1Hx[78Cx[7;1Hx[78Cx[8;1Hx[78Cx[9;1Hx[78Cx[10;1Hx[78Cx[11;1Hx[78Cx[1

In [8]:
!echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
!echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
!source ~/.bashrc

In [17]:
os.environ["KERAS_BACKEND"] = "torch"
os.environ["KAGGLE_USERNAME"] = KAGGLE_USERNAME
os.environ["KAGGLE_KEY"] = KAGGLE_KEY
MODEL_SIZE="2b"
!KERAS_BACKEND=torch python export_gemma_to_hf.py \
  --weights_file ./gemma-lora-model-tpuv5_20250717_212334/fine_tuned.weights.h5 \
  --size 2b \
  --gemma_version 1 \
  --vocab_path ./gemma-lora-model-tpuv5_20250717_212334/vocabulary.spm \
  --output_dir ./gemma-lora-model-tpuv5_20250717_212334/fine_tuned_gg_hf


2025-07-18 14:35:48.720820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752849349.116179    3760 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752849349.226170    3760 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752849350.207694    3760 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752849350.207741    3760 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752849350.207746    3760 computation_placer.cc:177] computation placer alr

#### Copy converted HF files to GCS

In [18]:
import os
merged_model_dir = 'gemma-lora-model-tpuv5_20250717_212334'
HUGGINGFACE_MODEL_DIR = os.path.join("./", merged_model_dir, "fine_tuned_gg_hf")
HUGGINGFACE_MODEL_DIR_GCS = os.path.join(merged_model_output_dir, "fine_tuned_gg_hf")
HUGGINGFACE_MODEL_DIR

'./gemma-lora-model-tpuv5_20250717_212334/fine_tuned_gg_hf'

In [19]:
!gcloud storage cp $HUGGINGFACE_MODEL_DIR/* $HUGGINGFACE_MODEL_DIR_GCS

Copying file://./gemma-lora-model-tpuv5_20250717_212334/fine_tuned_gg_hf/config.json to gs://tpu-training-gemma-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250718_143427/fine_tuned_gg_hf/config.json
Copying file://./gemma-lora-model-tpuv5_20250717_212334/fine_tuned_gg_hf/generation_config.json to gs://tpu-training-gemma-ml-project-461521-unique/model/gemma-lora-model-tpuv5_20250718_143427/fine_tuned_gg_hf/generation_config.json
uploading large objects. If you would like to opt-out and instead
perform a normal upload, run:
`gcloud config set storage/parallel_composite_upload_enabled False`
`gcloud config set storage/parallel_composite_upload_enabled True`
Note that with parallel composite uploads, your object might be
uploaded as a composite object
(https://cloud.google.com/storage/docs/composite-objects), which means
that any user who downloads your object will need to use crc32c
checksums to verify data integrity. gcloud storage is capable of
computing crc32c checksums, bu

### Deploy fine tuned models
This section uploads the model to Model Registry and deploys it on the Endpoint using [vLLM](https://github.com/vllm-project/vllm)

The model deployment step takes 15 minutes to 1 hour to complete, depending on the model sizes.

In [20]:
MODEL_NAME_VLLM = get_job_name_with_datetime(prefix="gemma-vllm-serve")
MODEL_SIZE="2b"
# Start with a G2 Series cost-effective configuration
if MODEL_SIZE == "2b":
    machine_type = "g2-standard-8"
    accelerator_type = "NVIDIA_L4"
    accelerator_count = 1
elif MODEL_SIZE == "7b":
    machine_type = "g2-standard-12"
    accelerator_type = "NVIDIA_L4"
    accelerator_count = 1
else:
    assert MODEL_SIZE in ("2b", "7b")

# See supported machine/GPU configurations in chosen region:
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute

# For even more performance, consider V100 and A100 GPUs
# > Nvidia Tesla V100
# machine_type = "n1-standard-8"
# accelerator_type = "NVIDIA_TESLA_V100"
# > Nvidia Tesla A100
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"

# Larger `max_model_len` values will require more GPU memory
max_model_len = 2048

model, endpoint = deploy_model_vllm(
    MODEL_NAME_VLLM,
    HUGGINGFACE_MODEL_DIR_GCS,
    SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
)

Creating Model
Create Model backing LRO: projects/190345877179/locations/us-central1/models/70600741131124736/operations/6247096005808881664
Model created. Resource name: projects/190345877179/locations/us-central1/models/70600741131124736@1
To use this Model in another session:
model = aiplatform.Model('projects/190345877179/locations/us-central1/models/70600741131124736@1')
Creating Endpoint
Create Endpoint backing LRO: projects/190345877179/locations/us-central1/endpoints/1458248187058847744/operations/7685996086753755136
Endpoint created. Resource name: projects/190345877179/locations/us-central1/endpoints/1458248187058847744
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/190345877179/locations/us-central1/endpoints/1458248187058847744')
Deploying model to Endpoint : projects/190345877179/locations/us-central1/endpoints/1458248187058847744
Deploy Endpoint model backing LRO: projects/190345877179/locations/us-central1/endpoints/1458248187058847744/

#### Click on the console log url output from this cell to see your logs

In [22]:
print(model, endpoint)

<google.cloud.aiplatform.models.Model object at 0x7f004c7139d0> 
resource name: projects/190345877179/locations/us-central1/models/70600741131124736 <google.cloud.aiplatform.models.Endpoint object at 0x7f004c943e20> 
resource name: projects/190345877179/locations/us-central1/endpoints/1458248187058847744


In [21]:
startdate = datetime.today() - timedelta(days=1)
startdate = startdate.strftime("%Y-%m-%d")
log_link = "https://console.cloud.google.com/logs/query;query=resource.type=%22aiplatform.googleapis.com%2FEndpoint%22"
log_link += f"%20resource.labels.endpoint_id=%22{endpoint.name}%22"
log_link += f"%20resource.labels.location={LOCATION}"
log_link += f"%20timestamp%3E={startdate}"
print(log_link)

https://console.cloud.google.com/logs/query;query=resource.type=%22aiplatform.googleapis.com%2FEndpoint%22%20resource.labels.endpoint_id=%221458248187058847744%22%20resource.labels.location=us-central1%20timestamp%3E=2025-07-17


NOTE: The overall deployment can take 30-40 minutes or more. After the deployment succeeds (15-20 minutes or so), the fine-tuned model is downloaded from the GCS bucket used in training above. Thus, an additional ~15-20 minutes (depending on the model sizes) of waiting time is needed **after** the model deployment step above succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

### Send a prediction request

Once deployment succeeds, you can send requests to the endpoint with text prompts. Use the same example used earlier in the notebook

Example:

```
Prompt: Return 3 things I ask for in this format and do not repeat my prompt. Response: 1) item 1 2) item 2 3) item 3. List the 3 best comedy movies in the 90s Response:
Response:  1) The Cable Guy 2) Scooby-Doo 3) Beethoven Requirements
```

In [14]:
from google.cloud import aiplatform

aiplatform.init(
    project="190345877179",
    location="us-central1"
)

endpoint = aiplatform.Endpoint(
    endpoint_name="projects/190345877179/locations/us-central1/endpoints/1458248187058847744"
)

PROMPT = "Answer the following question clearly and seriously. Do not repeat or continue:Q: What is the theory of relativity? A:"

instances = [
    {
        "prompt": PROMPT,
        "max_tokens": 700,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "stop": ["Q:", "Question:"]

    }
]

response = endpoint.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)


Prompt:
Answer the following question clearly and seriously. Do not repeat or continue:Q: What is the theory of relativity? A:
Output:
 According to the theory of relativity, the speed of light is constant in all inertial reference frames. This is an example of a universal law of nature. The theory of relativity is based on the idea that the speed of light is constant in all inertial reference frames. The speed of light is constant in all inertial reference frames. The speed of light is not constant in all inertial reference frames. The speed of light is not constant in all inertial reference frames.The theory of relativity is based on the idea that the speed of light is not constant in all inertial reference frames.


## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:
# Delete the train job.
job_client.delete_custom_job(name=create_tpuv5e_gemma_peft_job_response.name)

# Undeploy model and delete endpoint.
endpoint.delete(force=True)

# Delete models.
model.delete()

import os

# Delete Cloud Storage objects that were created
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI