In [None]:
#@title Copyright 2025 Google LLC. { display-mode: "form" }
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="ee-notebook-buttons" align="left"><td>
<a target="_blank"  href="http://colab.research.google.com/github/google/earthengine-community/blob/master/guides/linked/Yggdrasil_decision_forests_earthengine_vertex_ai.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" /> Run in Google Colab</a>
</td><td>
<a target="_blank"  href="https://github.com/google/earthengine-community/blob/master/guides/linked/Yggdrasil_decision_forests_earthengine_vertex_ai.ipynb"><img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" /> View source on GitHub</a></td></table>

# Using a YDF model in Earth Engine

[Yggdrasil Decision Forests (YDF)](https://ydf.readthedocs.io/en/latest/) is an implementation of popular tree-based machine learning models compatible with TensorFlow.  These models can be saved and hosted on Vertex AI, as with TensorFlow neural networks.  This notebook demonstrates how to use YDF to train a model, host the model on Vertex AI and get interactive predictions in Earth Engine.  The demonstration model produces a map of land cover from image data and pre-generated training data.

To get started, import the necessary libraries and authenticate.


#### **Warning!** This demo consumes billable resources of Google Cloud, including Earth Engine, Vertex AI and Cloud Storage.

## Setup

In [None]:
from google.colab import auth
import google

In [None]:
# Authenticate the Colab user, then load the default Google credentials.
# `credentials` is passed to ee.Initialize in a later cell.
auth.authenticate_user()
credentials, project = google.auth.default()

In [None]:
# Placeholders: replace with your own Cloud project ID and Cloud Storage bucket.
# MY_PROJECT is passed to ee.Initialize; MY_BUCKET receives the exported
# training data and the saved model.
MY_PROJECT = 'my-project'
MY_BUCKET = 'my-bucket'

In [None]:
import ee
# Initialize Earth Engine with the credentials loaded above, under MY_PROJECT.
ee.Initialize(credentials, project=MY_PROJECT)

In [None]:
import ydf  # Yggdrasil Decision Forests
import pandas as pd  # Use Pandas to load small datasets
import tensorflow as tf

## Training and input data

Grab the training data from [this Code Editor demo](https://code.earthengine.google.com/?scriptPath=Examples%3ADemos%2FClassification).

In [None]:
# Labeled features used as training data; the sampling cell below reads the
# class label from their 'landcover' property.
demo_labels = ee.FeatureCollection('projects/google/demo_landcover_labels')

In [None]:
# Number of output classes; used as the last dimension of the model output.
NUM_CLASSES = 3
# Image bands selected from the composite and used as model inputs.
BANDS = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12']

In [None]:
year = 2018

s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
csPlus = ee.ImageCollection('GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED')

# Bounding box of the label features (maxError of 100).  bounds() is called on
# the collection's geometry, since it is a Geometry method rather than a
# FeatureCollection method.
ROI = demo_labels.geometry().bounds(100)

# Cloud Score+ band used for masking, and the minimum score to keep a pixel.
QA_BAND = 'cs_cdf'
CLEAR_THRESHOLD = 0.60


def _mask_unclear(img):
  """Masks pixels whose QA_BAND value is below CLEAR_THRESHOLD."""
  is_clear = img.select(QA_BAND).gte(CLEAR_THRESHOLD)
  return img.updateMask(is_clear)


# Median composite of the masked images for `year`, restricted to the
# model input bands and cast to float.
composite = (s2
    .filterBounds(ROI)
    .filter(ee.Filter.calendarRange(year, year, 'year'))
    .linkCollection(csPlus, [QA_BAND])
    .map(_mask_unclear)
    .median()
    .select(BANDS)
    .float())

# The name of the property on the points storing the class label.
class_property = 'landcover'

# Sample the composite at the labeled features to generate training data.
# Only the class label property is carried over from the input features.
training = composite.sampleRegions(
  collection=demo_labels,
  properties=[class_property],
  scale=30
)

In [None]:
# Fetch the first sampled feature from Earth Engine and print it as a sanity check.
print(training.first().getInfo())

Export the training data.  Here we do it twice: once as CSV and once as TFRecord (shown below).  The CSV is loaded to a Pandas dataframe which is used to train the model.  The TFRecord is used to test the input and output of the trained model saved to TensorFlow saved model format, for hosting on Vertex AI.

In [None]:
desc = 'demo_landcover_labels_s2_training'

# Start one export task per format.  The CSV is read back with pandas to train
# the model; the TFRecord is used to test the saved model.  Both share the same
# file name prefix and differ only by extension.
# The tasks run asynchronously: wait for them to finish before reading the files.
for file_format in ('CSV', 'TFRecord'):
  ee.batch.Export.table.toCloudStorage(
      collection=training,
      bucket=MY_BUCKET,
      description=desc,
      fileNamePrefix=desc,
      fileFormat=file_format,
  ).start()

In [None]:
# Load the exported CSV, keep only the band columns and the label, and cast
# every column except the label to float32.
train_ds = (
    pd.read_csv(f'gs://{MY_BUCKET}/{desc}.csv')
    .filter(regex='B.*|landcover', axis=1)
    .pipe(lambda frame: frame.astype(
        {col: 'float32' for col in frame.columns if col != 'landcover'}))
)
train_ds.head()

## Train a model

In [None]:
# Train a gradient boosted trees model with 10 trees, using the 'landcover'
# column of the dataframe as the label.
model = ydf.GradientBoostedTreesLearner(label='landcover', num_trees=10).train(train_ds)

In [None]:
# Display a description of the trained model.
model.describe()

## Save the model

Everything comes in and out of Earth Engine as `float32`.

In [None]:
# Band names of the composite, fetched from Earth Engine; these are the model inputs.
INPUT_NAMES = composite.bandNames().getInfo()

# One scalar float32 fixed-length feature spec per input band.
columns = [
  tf.io.FixedLenFeature(shape=(), dtype=tf.float32) for _ in INPUT_NAMES
]

# Map each band name to its feature spec.
features_dict = {name: spec for name, spec in zip(INPUT_NAMES, columns)}
features_dict

The YDF model is non-spatial.  We need to make the return type into shape `[batch, height, width, output_dimension]` where `height` and `width` are both one (a one-pixel neighborhood) and `output_dimension` is the number of classes in the model.

In [None]:
@tf.function
def post_processing(outputs):
  """Reshapes model outputs to [batch, 1, 1, NUM_CLASSES].

  Args:
    outputs: Tensor of model outputs to reshape.

  Returns:
    The same values with shape [-1, 1, 1, NUM_CLASSES].
  """
  # Height and width are both one; the batch dimension is inferred.
  target_shape = [-1, 1, 1, NUM_CLASSES]
  return tf.reshape(outputs, target_shape)

Save the model to accept serialized TensorFlow example protos.  Here is where to specify the input format, output format, and any pre- or post-processing you might want to do.  See [this reference](https://ydf.readthedocs.io/en/latest/py_api/GenericModel/#ydf.GenericModel.to_tensorflow_saved_model) for details.

In [None]:
# Write the model to Cloud Storage as a TensorFlow SavedModel that takes
# example protos as input and applies post_processing to its output.
model.to_tensorflow_saved_model(
    f'gs://{MY_BUCKET}/ydf_demo_tf_proto',
    mode='tf',
    feed_example_proto=True,
    feature_specs=features_dict,
    post_processing=post_processing,
)

## Test the saved model

Load the TFRecord datafile and send a batch of training data through it.  Ensure that the output is of shape `[batch, 1, 1, num_classes]`.

In [None]:
# Read the gzipped TFRecord export and take the first batch of four records.
data_file = f'gs://{MY_BUCKET}/{desc}.tfrecord.gz'
ds = tf.data.TFRecordDataset(data_file, compression_type='GZIP')
# Use the built-in next() rather than the Python 2-style .next() method.
batch = next(iter(ds.batch(4)))

In [None]:
# Load the saved model back from Cloud Storage with TensorFlow.
m = tf.saved_model.load(f'gs://{MY_BUCKET}/ydf_demo_tf_proto')

In [None]:
# Run the batch through the default serving signature; per the text above, the
# output is expected to have shape [batch, 1, 1, num_classes].
m.signatures['serving_default'](batch)

## Host the model on Vertex AI

Note that optimized containers are not supported for YDF models.  Also note that, depending on your workload, you might need to specify more or bigger machines when you host the model.  See [this guide](https://ydf.readthedocs.io/en/latest/tutorial/tf_serving/) for more details on payload types and hosting YDF models on Vertex AI.

In [None]:
# Settings for the gcloud commands in the cells below.
MODEL_NAME = 'ydf_demo_tf_proto'
REGION = 'us-central1'
# Prediction container image used to serve the model.
CONTAINER_IMAGE='us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-15:latest'
# Cloud Storage location of the saved model written earlier.
ARTIFACT_URI = f'gs://{MY_BUCKET}/{MODEL_NAME}'
ENDPOINT_NAME = MODEL_NAME + '_endpoint'

In [None]:
!gcloud ai models upload \
  --project={PROJECT} \
  --region={REGION} \
  --artifact-uri={ARTIFACT_URI} \
  --container-image-uri={CONTAINER_IMAGE} \
  --description={MODEL_NAME} \
  --display-name={MODEL_NAME} \
  --model-id={MODEL_NAME}

In [None]:
!gcloud ai endpoints create \
  --display-name={ENDPOINT_NAME} \
  --region={REGION} \
  --project={PROJECT}

In [None]:
ENDPOINT_ID = !gcloud ai endpoints list \
  --project={PROJECT} \
  --region={REGION} \
  --filter=displayName:{ENDPOINT_NAME} \
  --format="value(ENDPOINT_ID.scope())"
ENDPOINT_ID = ENDPOINT_ID[-1]

In [None]:
# Print the endpoint ID captured above to confirm the lookup returned a value.
print(ENDPOINT_ID)

In [None]:
!gcloud ai endpoints deploy-model {ENDPOINT_ID} \
  --project={PROJECT} \
  --region={REGION} \
  --model={MODEL_NAME} \
  --display-name={MODEL_NAME} \
  --machine-type=n1-highcpu-4 \
  --min-replica-count=2 \
  --max-replica-count=3

## Interactive inference

In [None]:
# Build the Code Editor link, passing the project and endpoint ID as URL
# parameters.  The project is MY_PROJECT, and the endpoint ID is inserted
# verbatim.
print('Prediction link:')
print(f'https://code.earthengine.google.com/28b903332503cab94694aeb24ff7dd84#project={MY_PROJECT};endpoint={ENDPOINT_ID};')

#### **Warning!** This demo consumes billable resources of Google Cloud, including Earth Engine, Vertex AI and Cloud Storage.  Be sure to shut down any prediction nodes to avoid ongoing charges.