In [None]:
import azureml.core
from azureml.core import Workspace, Dataset, Experiment, ContainerRegistry
from azureml.core.compute import AmlCompute
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment
from azureml.core.runconfig import RunConfiguration

from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

import os

# Report the installed Azure ML SDK version so runs can be traced back to it.
print(
    "Using AzureML SDK version:",
    azureml.core.VERSION,
)

In [None]:
"""
## Use existing workspace
"""
# Load the workspace handle from the JSON config file; the relative path is
# resolved against the current working directory.
ws = Workspace.from_config(path="../aml_workspace_config.json")

# Echo name and location so it is obvious which workspace later cells act on.
print(
    "Using workspace: {} @ location: {}".format(
        ws.name,
        ws.location,
    )
)

In [None]:
"""
##  Get The default Datastore
"""
# The workspace's default datastore is used for both the input CSV and the
# pipeline output further down.
default_datastore = ws.get_default_datastore()
print("Default Datastore: %s" % (default_datastore.account_name,))

In [None]:
"""
##  Use existing compute cluster
"""
compute_name = "GPU-NC24S"

# Look the cluster up once. `.get` yields None for an unregistered name, so
# `compute_target` is always bound for the cells that reference it later
# (previously a missing cluster left it undefined and printed nothing).
compute_target = ws.compute_targets.get(compute_name)

if isinstance(compute_target, AmlCompute):
    print("Found compute target. Using '{}' compute ".format( compute_name))
else:
    # Reached when the name is not registered in the workspace, and also when
    # it resolves to something that is not an AmlCompute cluster.
    print('Compute Target Not found. Create Manually.')

In [None]:
"""
##  Get Secret stuff
"""
# Address of the private container registry used by the run configuration.
registry_address = "5027c9a8fca54c36927f93253a076626.azurecr.io"

# Registry credentials are read from the workspace's default Key Vault rather
# than being written into the notebook.
keyvault = ws.get_default_keyvault()
registry_password = keyvault.get_secret("registry-password")
registry_user = keyvault.get_secret("registry-user")

In [None]:
# Curated environment to fetch from the workspace.
# Alternative (disabled): 'AzureML-pytorch-1.7-ubuntu18.04-py37-cuda11-gpu'
curated_env_name = 'AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu'

# NOTE(review): `pytorch_env` is not referenced by any other cell visible in
# this notebook; the run configuration builds its own environment instead.
pytorch_env = Environment.get(name=curated_env_name, workspace=ws)

In [None]:
aml_run_config = RunConfiguration()

# Run on the cluster named by `compute_name` (set in the "Use existing compute cluster" cell)
aml_run_config.target = compute_name
aml_run_config.framework = 'Python'

# Private container registry that the Docker base image is resolved against
registry = ContainerRegistry()
registry.address = registry_address
# Explicit credentials were needed in the cybersai workspace; the reason is still to be investigated
registry.username = registry_user
registry.password = registry_password
aml_run_config.environment.docker.base_image_registry = registry

aml_run_config.environment.docker.enabled = True
# Alternative base image (disabled): "openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04"
aml_run_config.environment.docker.base_image = "pytorch1.9-openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04"
# NOTE(review): not yet confirmed that this shared-memory size setting takes effect
aml_run_config.environment.docker.shm_size = "16g"

# Python dependencies are not user-managed; they come from the pip list below
aml_run_config.environment.python.user_managed_dependencies = False
aml_run_config.environment.python.interpreter_path = 'python3'

# Packages relied on by the training step; most come from the requirements.txt
# that accompanies the run_mlm script.
# NOTE(review): an earlier comment said PyTorch Lightning must be pinned to an
# exact version, but no Lightning package is listed here - confirm whether a
# pin is still required.
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    #conda_packages=['pytorch'],
    pip_packages=[
        'azureml-sdk',
        'numpy',
        'pandas',
        'transformers',
        # 'sklearn' is a deprecated PyPI alias whose installation now fails;
        # the real distribution name is 'scikit-learn'.
        'scikit-learn',
        'datasets',
        'accelerate',
        'sentencepiece',
        'protobuf',
    ],
    pin_sdk_version=False)

In [None]:
# Option 1 (disabled): datasets registered in the workspace under explicit names
#roberta_ps_train = Dataset.get_by_name(ws, name='roberta_ps_train')
#roberta_ps_test = Dataset.get_by_name(ws, name='roberta_ps_test')

# Option 2 (used here): an unregistered dataset built straight from a datastore
# path, which is less work to maintain.
datastore_paths = [(default_datastore, 'imdb/data/imdb_unsupervised.csv')]

# File dataset rather than tabular; the tabular variant is kept for reference:
#dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)
dataset = Dataset.File.from_files(path=datastore_paths)

# Trailing expression so the notebook displays the dataset object
dataset

In [None]:
## Create or refer to a script folder on your local machine
# Join the components separately instead of embedding "..\src": that literal
# relied on the invalid escape sequence "\s" and on the Windows-only "\"
# separator, so it did not point at ../src on other platforms.
script_folder = os.path.join(os.getcwd(), "..", "src")
print("script_folder",script_folder)

# Location on the default datastore where the training step writes its output
train_output = PipelineData("train_output", datastore=default_datastore)

# The same file dataset is consumed (as a download) for both training and validation
train_consumption_conf = dataset.as_download()
test_consumption_conf = dataset.as_download()

# Arguments common to both modes, split in two so each final list keeps its
# original ordering.
_shared_data_and_schedule_args = [
    '--train_file', train_consumption_conf,
    '--validation_file', test_consumption_conf,
    '--do_train',
    '--do_eval',
    '--num_train_epochs', 5,
    '--save_steps', 10000,  # comment this out later
    '--output_dir', train_output,
]
_shared_sequence_args = [
    '--max_seq_length', 512,
    '--per_device_train_batch_size', 4,
    '--fp16', True,
]

# Training from scratch: `--model_type roberta` builds a new model and needs
# the custom tokenizer. (The variable name keeps its original spelling.)
from_sratch_arguments = (
    ['--model_type', 'roberta']
    + _shared_data_and_schedule_args
    + ['--tokenizer_name', 'tokenizers/imdb_tokenizer']
    + _shared_sequence_args
    + ['--config_overrides', 'max_position_embeddings=514']
)

# Fine tuning: `--model_name_or_path roberta-base` starts from the pre-trained
# checkpoint, see https://huggingface.co/roberta-base
fine_tune_arguments = (
    ['--model_name_or_path', 'roberta-base']
    + _shared_data_and_schedule_args
    + _shared_sequence_args
)



In [None]:
# Single pipeline step that runs run_mlm.py with the fine-tuning arguments.
train_mlm_step = PythonScriptStep(
    # What to run
    source_directory=script_folder,
    script_name="run_mlm.py",
    arguments=fine_tune_arguments,
    # Data flowing in and out of the step
    inputs=[train_consumption_conf, test_consumption_conf],
    outputs=[train_output],
    # Where and how to run it
    compute_target=compute_target,
    runconfig=aml_run_config,
    # Always execute the step rather than reusing the results of an earlier run
    allow_reuse=False,
)

In [None]:
# Assemble the pipeline; it currently consists of the single training step.
imdb_roberta_pipeline = Pipeline(
    workspace=ws,
    steps=[train_mlm_step],
)

In [None]:
# Submit the pipeline under an experiment that carries the same name.
experiment = Experiment(workspace=ws, name='imdb_roberta_pipeline')
pipeline_run = experiment.submit(config=imdb_roberta_pipeline)

# Tag the run with the training mode. Use the disabled line below instead when
# the step is switched to the from-scratch arguments.
#pipeline_run.set_tags({'ModelType':'RoBERTa from Scratch'})
pipeline_run.set_tags({'ModelType': 'RoBERTa Base Fine-tuning'})

# Uncomment to block the notebook until the pipeline run has finished.
#pipeline_run.wait_for_completion()