In [None]:
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# XManager codelab notebook

This notebook will take you through running an XManager experiment on Google Cloud Platform (GCP).

A stand-alone Jupyter Notebook can be created via GCP's [Vertex AI Notebooks](https://console.cloud.google.com/vertex-ai/notebooks/list/instances).

JupyterLab can be installed on your machine following [Jupyter's installation](https://jupyter.org/install).

## Install any prerequisites

1. Create a [GCP project](https://console.cloud.google.com/) if one does not already exist.

2. Install [Docker](https://docs.docker.com/engine/install/) if it is not already installed.

## Download and install XManager

In [None]:
!pip install xmanager

## Set default GCP values

The following gets the GCP project.

In [None]:
from google import auth
# auth.default() yields the default credentials together with the project id.
# Resolve them with a single call instead of running the lookup twice.
credentials, project = auth.default()
print('GCP Project:', project)

Use `gcloud auth application-default login` if the above command results in an error or the project is incorrect.

XManager requires a Google Cloud Storage Bucket. [Create one](https://cloud.google.com/storage/docs/creating-buckets) if one does not already exist and enter it in the box below.

In [1]:
from IPython.display import display
import ipywidgets
import os

def bucket_changed(change):
    """Copy a changed value into the GOOGLE_CLOUD_BUCKET_NAME environment variable.

    Args:
        change: change notification whose `new` attribute holds the updated value.
    """
    updated_bucket = change.new
    os.environ['GOOGLE_CLOUD_BUCKET_NAME'] = updated_bucket

# Text box for the Cloud Storage bucket name. It starts from any value already
# present in the environment, so the cell can be re-run (or run unattended)
# without retyping the bucket name.
GOOGLE_CLOUD_BUCKET_NAME = ipywidgets.Text(
    value=os.environ.get('GOOGLE_CLOUD_BUCKET_NAME', ''),
    description='GOOGLE_CLOUD_BUCKET_NAME:',
    style={'description_width': 'initial'},
    layout=ipywidgets.Layout(width='50%'),
)
# Mirror every edit of the box into the environment variable.
GOOGLE_CLOUD_BUCKET_NAME.observe(bucket_changed, names='value')

display(GOOGLE_CLOUD_BUCKET_NAME)

Text(value='', description='GOOGLE_CLOUD_BUCKET_NAME:', layout=Layout(width='50%'), style=DescriptionStyle(des…

In [None]:
from xmanager import xm
from xmanager import xm_local
# A notebook has no XManager command line, so nothing parses the absl flags for us.
# Parse them here so the flags take their default values; this avoids an
# absl.flags.UnparsedFlagAccessError.
# Normally XManager flags are set via the command line with `xmanager train.py -- --key=value`.
from absl import flags
flags.FLAGS([''])  # Parse an argument list that holds only an empty placeholder entry.
# NOTE(review): what xm_wrap_late_bindings controls is defined inside XManager and is
# not visible in this notebook -- confirm why it has to be True here.
flags.FLAGS.xm_wrap_late_bindings = True

## Launching an experiment

This code block imports dependencies used in later steps.

In [None]:
import itertools
import os

from xmanager import xm
from xmanager import xm_local

An experiment can be broken down into 5 steps:

1. Creating the experiment.
2. Defining the executable specification.
3. Defining the execution environment.
4. Creating the jobs.
5. Defining the hyperparameters.

### Creating the experiment

Give the experiment a name. The `create_experiment` method will also create a unique integer id for the experiment and save this experiment to a database.

In [None]:
# Open a local experiment titled 'my-first-experiment' and report its id.
# NOTE: `experiment` stays bound after this block ends; the packaging cell
# further down calls `experiment.package(...)` on it.
async with xm_local.create_experiment(experiment_title='my-first-experiment') as experiment:
    print(f'Local Experiment created with experiment_id={experiment.experiment_id}')

### Defining the executable specification

Define the job that will run in the experiment. A `PythonContainer` is an example of an executable specification. This executable specification tells XManager to package everything inside the `PythonContainer.path` as a container and use `PythonContainer.entrypoint` as the main module. The cell below uses one of the examples from the XManager repository, `~/xmanager/examples/cifar10_torch`, as the path; this assumes the repository has been cloned to `~/xmanager` (for example with `git clone https://github.com/deepmind/xmanager.git ~/xmanager`), because `pip install xmanager` alone does not create that directory.

We also need to declare where the executable should be staged. This step will upload the executable specification to the correct storage option that is best suited for the execution environment. For example, if the execution environment is Vertex AI, the executable must be stored in Google Container Registry. The `Vertex.Spec()` specification will upload the specification to Google Container Registry, where it will be accessible by Vertex AI.

In [None]:
# Describe the container: package the cifar10_torch example directory, run its
# `cifar10` module as the entry point, and stage it with the Vertex spec.
cifar10_container = xm.python_container(
    entrypoint=xm.ModuleName('cifar10'),
    path=os.path.expanduser('~/xmanager/examples/cifar10_torch'),
    executor_spec=xm_local.Vertex.Spec(),
)
# A single specification goes in, so exactly one executable is unpacked.
(executable,) = experiment.package([cifar10_container])

### Defining the execution environment

Declare where the job will run and what compute requirements are necessary to run one job. To run on Vertex AI, we must use the `xm_local.Vertex` executor. Each job should use 1 NVIDIA T4 GPU, so we must pass an `xm.JobRequirements` to the executor.

In [None]:
# Vertex executor whose jobs each request one T4 GPU.
executor = xm_local.Vertex(
    requirements=xm.JobRequirements(T4=1),
)

### Launching the jobs

Finally, we can create an experiment and add experiment units to it. To add a single job to the experiment, create an `xm.Job` object that combines the executable, compute requirements, and custom arguments (hyperparameters), and add the job to the experiment.

In [None]:
async with xm_local.create_experiment(experiment_title='cifar10') as experiment:
    experiment.add(xm.Job(
        executable=executable,
        executor=executor,
        args={'batch_size': 64, 'learning_rate': 0.01},
    ))

#### Defining the hyperparameters

In research, it is often required to run the experimental setup multiple times with different hyperparameter values. This is called **hyperparameter optimization**. The simplest form of hyperparameter optimization is called *grid search* or *parameter sweep*, which is an exhaustive search through all possible Cartesian products of hyperparameter values. Grid search trials can be constructed using `itertools`.

In [None]:
from pprint import pprint

# Candidate values for each hyperparameter.
inputs = {
    'batch_size': [64, 128],
    'learning_rate': [0.01, 0.001],
}

# Cartesian product of the candidate values: one dict per combination,
# keyed by hyperparameter name.
hyperparameters = [
    dict(zip(inputs.keys(), combination))
    for combination in itertools.product(*inputs.values())
]

pprint(hyperparameters)

To perform the grid search, loop over all the hyperparameters, passing a different hyperparameter configuration to the `args` parameter of each job. Add each job to the experiment.

In [None]:
async with xm_local.create_experiment(experiment_title='cifar10') as experiment:
    for hparams in trials:
        experiment.add(xm.Job(
            executable=executable,
            executor=executor,
            args=hparams,
        ))

### Tracking job status

You can list all of your previous experiments.

In [None]:
# Ids of every experiment returned by xm_local.list_experiments().
[listed.experiment_id for listed in xm_local.list_experiments()]

Some execution environments allow you to track the status of jobs in an experiment. Vertex AI is one of the execution environments that supports job-tracking.

In [None]:
# TODO: Use experiment.work_units instead of private member.
for i, unit in enumerate(experiment._experiment_units):
    # Query the status once per unit so both printed fields come from the same
    # snapshot, and the lookup is not repeated.
    status = unit.get_status()
    print(f'[{i}] Completed: {status.is_completed}, Failed: {status.is_failed}')

## End to end

Combining everything above into a single code-block, the launch script looks like this:

In [None]:
async with xm_local.create_experiment(experiment_title='cifar10') as experiment:
    [executable] = experiment.package([
        xm.python_container(
            executor_spec=xm_local.Vertex.Spec(),
            path=os.path.expanduser('~/xmanager/examples/cifar10_torch'),
            entrypoint=xm.ModuleName('cifar10'),
        )
    ])

    batch_sizes = [64, 128]
    learning_rates = [0.01, 0.001]
    trials = list(
        dict([('batch_size', bs), ('learning_rate', lr)])
        for (bs, lr) in itertools.product(batch_sizes, learning_rates)
    )
    for hyperparameters in trials:
        experiment.add(xm.Job(
            executable=executable,
            executor=xm_local.Vertex(requirements=xm.JobRequirements(T4=1)),
            args=hyperparameters,
        ))