In [None]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset

import logging

# Obtain the workspace handle through Workspace.from_config() and echo its
# name, resource group, location and subscription id, one value per line.
ws = Workspace.from_config()
print('\n'.join(map(str, (ws.name, ws.resource_group, ws.location, ws.subscription_id))))

In [None]:
# Local folder name kept alongside the experiment settings.
project_folder = './automl-run-capstone-project'

# Experiment in workspace `ws` under which this notebook's runs are grouped.
experiment_name = 'azure-nd-project-capstone-hyperdrive'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cluster-nd-capstone"

# Reuse the cluster when it is already present in the workspace; the lookup
# raises ComputeTargetException otherwise, in which case it is provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # vm_priority='lowpriority' is an optional setting that is left disabled here.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        min_nodes=1,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Wait (up to 10 minutes) for the cluster operation to finish, streaming its output.
# compute_target.get_status() gives a more detailed view of the current AmlCompute state.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=10,
)

In [None]:
from scripts.cleansing import clean_data
import pandas as pd

def get_cleaned_dataset():
    """Return the cleaned OpenFoodFacts dataset, registering it if necessary.

    Looks for a dataset registered under the name ``"openfoodfacts"`` in the
    notebook-level workspace ``ws``. When it is not registered yet, the raw
    JSON Lines file is read, cleaned with ``clean_data``, exported to a local
    Parquet file, and then uploaded to the workspace's default datastore and
    registered under that name.

    Returns:
        The dataset object registered in the workspace for the cleaned data.
    """
    ds_key = "openfoodfacts"
    description_text = "Data extracted from OpenFoodFacts open source database."

    # Read the workspace registry once and reuse it for the check and the lookup.
    registered_datasets = ws.datasets
    if ds_key in registered_datasets:
        return registered_datasets[ds_key]

    # Not registered yet: build it from the raw JSON Lines file.
    data = pd.read_json('./eda/foods-features-v3.json', lines=True)
    data_cleaned = clean_data(data)

    # Keep a local Parquet copy of the cleaned frame.
    exported_df = 'cleaned-openfoodfacts.parquet'
    data_cleaned.to_parquet(exported_df)

    # Upload and register the pandas DataFrame in one step (experimental SDK
    # functionality). Dataset.Tabular is used so that no import beyond the
    # notebook's existing `Dataset` import is required.
    return Dataset.Tabular.register_pandas_dataframe(
        dataframe=data_cleaned,
        target=(ws.get_default_datastore(), exported_df),
        name=ds_key,
        description=description_text,
        show_progress=True,
    )

In [None]:

# Get the cleaned dataset from the workspace (get_cleaned_dataset registers it when it is missing).
ds_cleaned = get_cleaned_dataset()


## HyperDrive run for regression

In [None]:
import os
import shutil

from azureml.train.hyperdrive.parameter_expressions import choice # supported by RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import loguniform # supported by RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import uniform # supported by RandomParameterSampling
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.policy import NoTerminationPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.sklearn import SKLearn

# Parameter sampler for HyperDrive: values passed to the training script as
# command-line arguments.
ps = RandomParameterSampling(
    {
        '--learning_rate': uniform(0.01, 0.3),  # Contribution of each tree: uniform distribution
        '--n_estimators': choice(100, 150, 200, 250, 300, 350),  # Candidate values for --n_estimators
    }
)

# Early-termination policy. For a run reporting metric Y, the Bandit policy
# compares (Y + Y * slack_factor) to the best current score and cancels the
# run when that value is smaller; slack_factor is 0.01 here.
# NOTE(review): if hyperdrive-train.py reports the metric only once per run,
# there is nothing to cut short and NoTerminationPolicy (already imported)
# would state that intent explicitly -- confirm which one is wanted.
policy = BanditPolicy(slack_factor=0.01)

script_folder = './scripts'

# Data consumption config for the runs; the dataset is exposed to the script
# under the input name 'dataset'.
dataset_consumption_cfg = ds_cleaned.as_named_input('dataset')

# SKLearn estimator that runs hyperdrive-train.py from the scripts folder on
# the compute cluster.
est = SKLearn(
    source_directory=script_folder,
    entry_script='hyperdrive-train.py',
    inputs=[dataset_consumption_cfg],
    compute_target=compute_target,
)

# HyperDriveConfig combining the estimator, the hyperparameter sampler and
# the termination policy.
hyperdrive_config = HyperDriveConfig(
    hyperparameter_sampling=ps,
    policy=policy,
    estimator=est,
    # Name of the primary metric reported by the experiment runs.
    # NOTE(review): this must match the name logged by hyperdrive-train.py;
    # 'Accuracy' is unusual for a regression task -- confirm.
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_duration_minutes=30,
    max_concurrent_runs=4,  # 4 nodes
)
