In [35]:
!pip install azureml
!pip install azureml.core
!pip install azureml.widgets
!pip install azureml.pipeline

Collecting azureml.core
  Using cached azureml_core-1.23.0-py3-none-any.whl (2.1 MB)
[31mERROR: azureml-defaults 1.21.0 has requirement azureml-core~=1.21.0, but you'll have azureml-core 1.23.0 which is incompatible.[0m
[31mERROR: azureml-defaults 1.21.0 has requirement azureml-dataset-runtime[fuse]~=1.21.0, but you'll have azureml-dataset-runtime 1.23.0 which is incompatible.[0m
Installing collected packages: azureml.core
Successfully installed azureml.core
Collecting azureml.widgets
  Using cached azureml_widgets-1.23.0-py3-none-any.whl (14.1 MB)






Installing collected packages: azureml.widgets
Successfully installed azureml.widgets
Collecting azureml.pipeline
  Using cached azureml_pipeline-1.23.0-py3-none-any.whl (3.7 kB)


Installing collected packages: azureml.pipeline
Successfully installed azureml.pipeline


In [36]:
import os
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset, Run
from azureml.widgets import RunDetails
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputFileDataset

In [37]:
ws = Workspace.from_config()

In [38]:
keyvault = ws.get_default_keyvault()

In [39]:
default_datastore = ws.get_default_datastore()

In [40]:
dataset_who_vaccine_landscape_covid19_name = 'who-vaccine-landscape-covid19'
dataset_who_vaccine_landscape_covid19 = Dataset.get_by_name(workspace=ws, name=dataset_who_vaccine_landscape_covid19_name)

In [41]:
dataset_trials_landscape_name = 'trials-landscape'
dataset_trials_landscape = Dataset.get_by_name(workspace=ws, name=dataset_trials_landscape_name)

### Create CPU Compute

In [42]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# CPU cluster name and sku
aml_cpu_compute_cluster_name = 'cpucluster-d4-v3'
vm_size = 'STANDARD_D4_V3'

# Check cluster does not exist already
try:
    aml_cpu_compute = ComputeTarget(workspace=ws, name=aml_cpu_compute_cluster_name)
    print('Found existing cluster {}'.format(aml_cpu_compute_cluster_name))
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                           max_nodes=4)
    aml_cpu_compute = ComputeTarget.create(ws, aml_cpu_compute_cluster_name, compute_config)

aml_cpu_compute.wait_for_completion(show_output=True)

Found existing cluster cpucluster-d4-v3
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Define Runtime Configuration

In [43]:
# https://docs.microsoft.com/en-us/azure/container-registry/container-registry-auth-service-principal

from azureml.core.runconfig import RunConfiguration
aml_run_config = RunConfiguration()

from azureml.core.environment import Environment
# Create the environment
rp_env = Environment(name='rp')
# Enable Docker and reference an image
rp_env.docker.enabled = True
rp_env.python.user_managed_dependencies=True

rp_env.docker.base_image_registry.address = "acieurfrcaassacr.azurecr.io"
rp_env.docker.base_image_registry.username = keyvault.get_secret(name='acieurfrcaassacr-admin-user')
rp_env.docker.base_image_registry.password = keyvault.get_secret(name='acieurfrcaassacr-admin-pwd')
rp_env.docker.base_image = "acieurfrcaassacr.azurecr.io/azureml-env-base-research-platform:latest"

aml_run_config.environment = rp_env
rp_env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "acieurfrcaassacr.azurecr.io/azureml-env-base-research-platform:latest",
        "baseImageRegistry": {
            "address": "acieurfrcaassacr.azurecr.io",
            "password": "AzureMlSecret=Env_796e80c7e5bf8a48cb603f229eeec578ec72443dbfe37710cc80de67228c6713_1#EnvironmentDefinition#ContainerRegistry#Password",
            "registryIdentity": null,
            "username": "acieurfrcaassacr"
        },
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "rp",
    

## Specify script source folder(s)

In [44]:
dataprep_script_folder = '../'
print('Source directory for data preparation is {}.'.format(os.path.realpath(dataprep_script_folder)))

Source directory for data preparation is /media/shared/My Documents/My Developments/research-platform/code/dataprep/clinical-trials/who-vaccine-landscape.


### Step - Extract WHO PDF to trials JSON

In [45]:
trials_out = PipelineOutputFileDataset(PipelineData(name="dataset_trials_landscape",datastore=default_datastore))

step_dataprep_extract = PythonScriptStep(
    name='Extract WHO to trials JSON',
    script_name='step_dataprep_extract.py', 
    arguments=['--input', dataset_who_vaccine_landscape_covid19.as_named_input('raw_who_data').as_mount(),
               '--output_dataset', 'trials-landscape',
               '--disease', 'COVID19'],
    outputs=[trials_out],
    compute_target=aml_cpu_compute,
    runconfig=aml_run_config,
    source_directory=dataprep_script_folder,
    allow_reuse=True
)

print('Step Dataprep Extract created')

Step Dataprep Extract created


### Create Step List

In [46]:
steps = [step_dataprep_extract]
print("Step lists created")

Step lists created


In [47]:
aml_pipeline = Pipeline(workspace=ws, steps=steps)
print ("Pipeline is built")

Pipeline is built


In [48]:
aml_pipeline.validate()
print("Pipeline validation complete")

Step Extract WHO to trials JSON is ready to be created [3cbe53fd]
Pipeline validation complete


In [49]:
# Note regenerate output, decides whether to execute the module to regenerate output at running time.
# It's by default unselected, which means if the module has been executed with the same parameters previously, 
# the system will reuse the output from last run to reduce run time.

aml_pipeline_run = Experiment(ws, 'pipeline-who-vaccine-landscape-covid19').submit(aml_pipeline, regenerate_outputs=True)
print("Experiment pipeline who-vaccine-landscape-covid19 is submitted for execution")

Created step Extract WHO to trials JSON [3cbe53fd][9d3fbbe5-a6d1-438c-b145-50ee9704d410], (This step will run and generate new outputs)
Submitted PipelineRun 5438ad05-126d-4e40-861c-a112988a598f
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/pipeline-who-vaccine-landscape-covid19/runs/5438ad05-126d-4e40-861c-a112988a598f?wsid=/subscriptions/19518d47-0c8b-4829-a602-c5ced78deb3f/resourcegroups/aci-eur-frc-aa-ss-rg/workspaces/aci-eur-frc-aa-ss-mlw
Experiment pipeline who-vaccine-landscape-covid19 is submitted for execution


In [50]:
RunDetails(aml_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [51]:
step_runs = aml_pipeline_run.get_children()
for step_run in step_runs:
    status = step_run.get_status()
    print('Script:', step_run.name, 'status:', status)
    
    # Change this if you want to see details even if the Step has succeeded.
    if status == "Failed":
        joblog = step_run.get_job_log()
        print('job log:', joblog)