In [None]:
from azureml.core import Workspace, Experiment

# Connect to the existing Azure ML workspace and open the experiment that receives the runs below.
ws = Workspace.get(name="quick-starts-ws-124780")
exp = Experiment(workspace=ws, name="udacity-project")

# Echo the workspace coordinates, one per line, as a quick sanity check.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# Start an interactive logging run in the experiment.
run = exp.start_logging()

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the compute cluster when the workspace already has one with this name; otherwise create it.
cluster_name = "u-proj-cluster"  # name between 2 and 16 symbols
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so the cluster has to be provisioned.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Wait for provisioning to finish (timeout: 20 minutes), showing progress output.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

# Print the detailed status of the cluster.
print(compute_target.get_status().serialize())


# Prepare data to register Datasets

In [None]:
# The data relates to direct marketing campaigns (phone calls) of a Portuguese banking institution.
# The classification goal is to predict whether the client will subscribe to a term deposit (variable y).
web_path = (
    'https://automlsamplenotebookdata.blob.core.windows.net'
    '/automl-sample-notebook-data/bankmarketing_train.csv'
)



## Register original dataset 

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Look the dataset up in the workspace first; build it from the web CSV only when it is not registered yet.
ds_key = "bankmarketing-ds"
description_text = "Direct marketing campaigns of a Portuguese banking institution."
found = ds_key in ws.datasets.keys()

if found:
    ds = ws.datasets[ds_key]
else:
    # Create a TabularDataset from the web path ...
    ds = TabularDatasetFactory.from_delimited_files(path=web_path)

    # ... and register it in the workspace under ds_key.
    ds = ds.register(
        workspace=ws,
        name=ds_key,
        description=description_text,
    )

## Register cleaned dataset

In [None]:
from train import clean_data

def get_cleaned_dataset():
    """Return the cleaned bank-marketing dataset, registering it first if it does not exist yet.

    Reads the notebook-level ``ws`` (workspace) and ``ds`` (original dataset). When the workspace
    has no dataset named "bankmarketing-cleaned-ds", the original data is converted to a pandas
    dataframe, passed through ``clean_data``, written to a local parquet file, and then uploaded
    to the default datastore and registered in a single call.

    Returns:
        The registered cleaned dataset.
    """
    dataset_name = "bankmarketing-cleaned-ds"

    # Fast path: the cleaned dataset is already registered in the workspace.
    if dataset_name in ws.datasets.keys():
        return ws.datasets[dataset_name]

    dataset_description = "Cleaned data of direct marketing campaigns of a Portuguese banking institution."
    parquet_name = 'cleaned-df.parquet'

    cleaned_df = clean_data(ds.to_pandas_dataframe())
    # NOTE(review): this local parquet copy does not appear to be read by the registration call
    # below, which receives the dataframe itself -- confirm before removing.
    cleaned_df.to_parquet(parquet_name)

    # Upload and register the pandas dataframe at once (experimental SDK functionality).
    return TabularDatasetFactory.register_pandas_dataframe(
        dataframe=cleaned_df,
        target=(ws.get_default_datastore(), parquet_name),
        name=dataset_name,
        description=dataset_description,
        show_progress=True,
    )


In [None]:
# Fetch the cleaned dataset from the workspace, or build and register it when it is not there yet.
ds_cleaned = get_cleaned_dataset()

# HyperDrive run

In [None]:
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import NoTerminationPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import loguniform # supported by RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import choice # supported by RandomParameterSampling
import os
import shutil

# Stage train.py (the training logic) in a dedicated folder that serves as the estimator's source directory.
script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
shutil.copy('./train.py', script_folder)

# Random sampling over the two hyperparameters handed to the training script.
ps = RandomParameterSampling({
    # Inverse of regularization strength, drawn as exp(uniform(1, 6)).
    '--C': loguniform(1, 6),
    # Maximum number of iterations to converge.
    '--max_iter': choice(100, 150, 200, 250, 300, 350),
})

# No termination policy, since the job is not iterative (the metric is calculated only once).
policy = NoTerminationPolicy()

# Data consumption config for the run: the cleaned dataset is passed as the named input 'dataset'.
dataset_consumption_cfg = ds_cleaned.as_named_input('dataset')

# SKLearn estimator that runs train.py on the compute cluster.
est = SKLearn(
    source_directory=script_folder,
    entry_script='train.py',
    inputs=[dataset_consumption_cfg],
    compute_target=compute_target,
)

# HyperDrive configuration combining the estimator, the hyperparameter sampler, and the policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    # Name of the primary metric reported by the experiment runs.
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_duration_minutes=30,
    max_concurrent_runs=4,  # 4 nodes
)



In [None]:
from azureml.widgets import RunDetails
# Submit the HyperDrive configuration to the experiment and show the run details with the notebook widget.
# The returned run handle (hyperdrive_run) is reused by the following cells.

hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()

In [None]:
# Wait for the HyperDrive run to complete, showing its output in the notebook
hyperdrive_run.wait_for_completion(show_output=True)

In [None]:
# Get the best run by the primary metric and print the arguments from its run definition
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

# Register the run's 'outputs/model.joblib' file as a model named 'best-model-hd' under the workspace;
# the file name 'model.joblib' is fixed in train.py
model_hd = best_run.register_model(model_name='best-model-hd', model_path='outputs/model.joblib')

# Save the registered model locally into ./outputs
model_hd.download(target_dir='./outputs', exist_ok=True)

# AutoML run

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Get the registered cleaned dataset (the helper returns the workspace copy when it already exists)
ds_cleaned = get_cleaned_dataset()


In [None]:
from azureml.train.automl import AutoMLConfig

# Parameters for AutoMLConfig, collected in one place and unpacked into the constructor below.
automl_settings = {
    # Task definition: classify the label column "y" of the cleaned dataset, scored by accuracy.
    "task": "classification",
    "primary_metric": "accuracy",
    "training_data": ds_cleaned,
    "label_column_name": "y",
    "n_cross_validations": 5,
    # Run budget and diagnostics.
    "experiment_timeout_minutes": 30,
    "enable_early_stopping": True,
    "debug_log": "automl_errors.log",
}

# featurization is not passed explicitly, so the SDK default applies.
automl_config = AutoMLConfig(compute_target=compute_target, **automl_settings)

In [None]:
# Create a separate experiment for AutoML, so its runs are kept apart from the "udacity-project" experiment
exp_automl = Experiment(workspace=ws, name="automl-project")
# Submit the AutoML configuration; the returned run handle (auto_ml_run) is used to fetch the best model later
auto_ml_run = exp_automl.submit(automl_config, show_output=True)

In [None]:
from pprint import pprint  # pretty printer


def print_model(model, prefix=""):
    """Print the pipeline step details of a model.

    Each step name is printed (with ``prefix`` prepended), followed by its details:

    * a step exposing both ``estimators`` and ``weights`` (an ensemble) is summarized as a dict
      of ``"<estimator name>:<name of its first step>"`` entries plus the weights, after which
      every member estimator is printed recursively with ``"<estimator name> - "`` as prefix;
    * any other step has the result of its ``get_params()`` pretty-printed.

    See https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features#scaling-and-normalization

    Args:
        model: Object with a ``steps`` attribute holding ``(name, step)`` pairs.
        prefix: Text prepended to every step name printed at this level.
    """
    for step in model.steps:
        step_name, stage = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(stage, 'estimators') and hasattr(stage, 'weights')
        if not is_ensemble:
            pprint(stage.get_params())
            print()
            continue

        # Ensemble summary: member name plus the name of the first step of the member's pipeline.
        pprint({
            'estimators': [e[0] + ':' + e[1].steps[0][0] for e in stage.estimators],
            'weights': stage.weights,
        })
        print()

        # The nested prefix is the member's own name; the outer prefix is not carried over.
        for estimator in stage.estimators:
            print_model(estimator[1], estimator[0] + ' - ')

In [None]:
# Retrieve and save the best AutoML model.

# If no input parameters are provided, get_output of AutoMLRun returns the best pipeline according to the primary metric.
# NOTE: this rebinds best_run, which an earlier cell used for the best HyperDrive run.
best_run, automl_model_pipeline = auto_ml_run.get_output()

# Register the run's 'outputs/model.pkl' file as a model named 'best-model-automl' under the workspace
# (the path passed here differs from the HyperDrive model's 'outputs/model.joblib')
model_automl = best_run.register_model(model_name='best-model-automl', model_path='outputs/model.pkl')

# Save the registered model locally into ./outputs
model_automl.download(target_dir='./outputs', exist_ok=True)

In [None]:
print_model(automl_model_pipeline)

# Clean Up

In [None]:
# Mark the interactive run opened with exp.start_logging() in the first cell as completed,
# so it does not stay in the "Running" state after the notebook is finished.
run.complete()

# Delete the compute cluster so it stops consuming resources.
compute_target.delete()