In [1]:
from azureml.core import Workspace, Experiment, Datastore, Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, choice, uniform, PrimaryMetricGoal

# Handle to the Azure ML workspace; every later cell (environment lookup,
# compute target, experiment) is resolved against this object.
# NOTE(review): from_config() presumably reads a local workspace config file
# (config.json) -- confirm one is present where this notebook runs.
ws = Workspace.from_config()

In [2]:
# Setup instructions for running this notebook on Azure are in the research team documentation.

# --- Azure resources -------------------------------------------------------
# Must be identical to the name of the compute cluster that was already created.
cluster_name = "TCLR-tlu-cluster"

# Name given to the Experiment that the runs are submitted to.
experiment_name = "TCLR-cultural-4096-3"

# --- Values forwarded to the training script --------------------------------
# Selects the dataset the runs are trained on.
data_name = "cultural"

# Sequence length used for Longformer.
max_length = 4096

# Number of training samples processed at a time; drives CUDA memory usage.
max_memory_size = 1

# Number of folds for k-fold cross validation.
folds = 3

# When True, the '--once' flag is passed so only a single quick experiment is run.
run_once = False

# --- Hyperparameter search ---------------------------------------------------
# Total number of runs, i.e. how many hyperparameter sets are sampled.
max_runs = 10

# Each key is the command line flag the sampled value is passed under.
search_space = {
    "--batch_size": choice([8, 16, 32]),
    "--lr": uniform(1e-5, 1e-3),
    "--pretrained_lr": uniform(1e-6, 1e-4),
    "--weight_decay": uniform(0, 0.001),
    "--num_warmup_steps": choice(range(0, 100, 20)),  # 0, 20, 40, 60, 80
    "--dropout": uniform(0.0, 0.3),
}
param_sampling = RandomParameterSampling(search_space)

In [3]:
# Build the training environment: start from the curated PyTorch 1.9 GPU
# environment registered in the workspace, then replace its image definition
# with the custom Dockerfile below.
TCLR_env = Environment.get(workspace=ws, name="AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu")

# Docker build steps, passed to Azure ML as a single string.
# The conda step creates the Python 3.7 / PyTorch 1.9 environment plus the
# extra packages the training script needs (transformers, scikit-learn, ...).
# The pip step afterwards installs pinned versions, some of which
# (matplotlib, pandas, tqdm) are also listed in the conda step.
dockerfile = r'''
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04:20211012.v1

ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/pytorch-1.9

# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
    python=3.7 \
    pip=20.2.4 \
    pytorch=1.9.0 \
    torchvision=0.10.0 \
    torchaudio=0.9.0 \
    cudatoolkit=11.1.1 \
    nvidia-apex=0.1.0 \
    transformers \
    scikit-learn \
    pandas \
    tqdm \
    matplotlib \
    -c anaconda -c pytorch -c conda-forge

# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH

# Install pip dependencies
RUN HOROVOD_WITH_PYTORCH=1 \
    pip install 'matplotlib>=3.3,<3.4' \
                'psutil>=5.8,<5.9' \
                'tqdm>=4.59,<4.60' \
                'pandas>=1.1,<1.2' \
                'scipy>=1.5,<1.6' \
                'numpy>=1.10,<1.20' \
                'ipykernel~=6.0' \
                'azureml-core==1.35.0' \
                'azureml-defaults==1.35.0' \
                'azureml-mlflow==1.35.0' \
                'azureml-telemetry==1.35.0' \
                'tensorboard==2.4.0' \
                'tensorflow-gpu==2.4.1' \
                'onnxruntime-gpu>=1.7,<1.8' \
                'horovod[pytorch]==0.21.3' \
                'future==0.17.1' \
                'imblearn' \
                'torch-tb-profiler==0.2.1'


# This is needed for mpi to locate libpython
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH
'''

# Work on a renamed copy so the curated environment itself is not modified.
TCLR_env = TCLR_env.clone("TCLR_env")

# base_image and base_dockerfile are mutually exclusive: clear any base image
# inherited from the curated environment before supplying the Dockerfile.
# This is a no-op when the inherited base_image is already None.
TCLR_env.docker.base_image = None
TCLR_env.docker.base_dockerfile = dockerfile

# The Dockerfile installs every Python package itself, so Azure ML must not
# build its own conda environment on top of the image.
TCLR_env.python.user_managed_dependencies = True

In [4]:
# Look up the already-provisioned compute cluster by name in the workspace.
cluster = ComputeTarget(
    workspace=ws,
    name=cluster_name,
)

In [5]:
# Fixed (non-tuned) command line arguments for the training script, laid out
# as flag/value pairs. The sampled hyperparameters are supplied separately
# through the HyperDrive configuration.
runargs = [
    "--data_name", data_name,
    "--maxlen", max_length,
    "--max_memory_size", max_memory_size,
    "--folds", folds,
]

# Optional flag requesting a single quick run.
if run_once:
    runargs += ["--once"]

# Run configuration: which script to run, with which arguments, on which
# cluster and inside which environment. The whole current directory is used
# as the source directory.
src = ScriptRunConfig(
    source_directory=".",
    script="Longformer-CV.py",
    arguments=runargs,
    compute_target=cluster,
    environment=TCLR_env,
)

In [6]:

# Experiment in the workspace that the HyperDrive run is submitted to;
# its name comes from `experiment_name` in the configuration cell.
exp = Experiment(workspace=ws, name=experiment_name)

In [7]:

# Bandit early-termination policy: slack factor 0.15, applied at every
# evaluation interval, with the first 10 intervals exempt from evaluation.
early_termination_policy = BanditPolicy(
    evaluation_interval=1,
    delay_evaluation=10,
    slack_factor=0.15,
)

# HyperDrive sweep: random sampling over `param_sampling`, maximising the
# 'accuracy' metric, `max_runs` runs in total with at most 2 at the same time.
# NOTE(review): the training script is presumably expected to log a metric
# named 'accuracy' -- confirm against Longformer-CV.py.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=max_runs,
    max_concurrent_runs=2,
)

In [8]:
# Start the HyperDrive run by submitting its configuration to the experiment.
# The returned run handle is kept in `hyperdrive_run` for later inspection.
hyperdrive_run = exp.submit(hyperdrive_config)