In [1]:
from azureml.core import Workspace, Experiment

#ws = Workspace.get(name="udacity-project")
#exp = Experiment(workspace=ws, name="udacity-project")
#ws = Workspace.get(name="quick-starts-ws-148105",resource_group="quick-starts-ws-148105",subscription_id="81cefad3-d2c9-4f77-a466-99a7f541c7bb")
#print(ws)
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")


print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-148146
Azure region: southcentralus
Subscription id: cdbe0b43-92a0-4715-838a-f2648cc7ad21
Resource group: aml-quickstarts-148146


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
amlcomp_clus = "P1Cluster"
try:
    amlcomp = ComputeTarget(workspace=ws, name=amlcomp_clus)
except ComputeTargetException:
    comp_cfg = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    
    amlcomp = ComputeTarget.create(ws, amlcomp_clus, comp_cfg)

amlcomp.wait_for_completion(show_output=True)


Creating......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
from azureml.core.environment import Environment
from azureml.core import ScriptRunConfig  
# Specify parameter sampler
ps = RandomParameterSampling({
        'C': uniform(0.05, 1),
        'max_iter': choice(10, 50, 100, 150, 200, 250)
        }
)

# Specify a Policy
bandpolicy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

trainingscript_folder = "training"
if trainingscript_folder not in os.listdir():
    os.mkdir(f"./{trainingscript_folder}")

#This may be irrelevant
#https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-scikit-learn?view=azure-ml-py
#https://docs.microsoft.com/en-us/azure/machine-learning/how-to-migrate-from-estimators-to-scriptrunconfig?view=azure-ml-py
#TODO Figure out the source directory and arguments requirements
# Create a SKLearn estimator for use with train.py
#est = ### YOUR CODE HERE ###
myenv = Environment.get(workspace=ws, name="AzureML-Tutorial")


src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=amlcomp,
                      environment=myenv)

#https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=ps,
                             policy=bandpolicy,
                             primary_metric_name="accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=50, ##Changed from 100
                             max_concurrent_runs=4)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hdrive_run = exp.submit(hyperdrive_config)
RunDetails(hdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [5]:
import joblib
from azureml.core.model import Model
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
hdrive_run.wait_for_completion(show_output=True)
best_run = hdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

#print(hdrive_run.get_children_sorted_by_primary_metric())

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print('\n --C - Inverse of regularization strength:',parameter_values[0])
print('\n --max_iter - Maximum number of iterations to converge:',parameter_values[1])
print('\n FileNames:', best_run.get_file_names())
print('----------------')
bestHdrivefolder="BestHdriveModel"
modelname = 'sklearn_hdrive_model'

os.makedirs(f'./{bestHdrivefolder}', exist_ok=True)
model = best_run.register_model(model_name='sklearn_hdrive',model_path='outputs/sklearn_hdrive_model.pkl')
print(model.name, model.id, model.version, sep='\t')
model.download(target_dir=f'./{bestHdrivefolder}/{modelname}.pkl', exist_ok = True)


RunId: HD_4965bb08-ab91-4c1d-8c56-0d94bd5bb999
Web View: https://ml.azure.com/runs/HD_4965bb08-ab91-4c1d-8c56-0d94bd5bb999?wsid=/subscriptions/cdbe0b43-92a0-4715-838a-f2648cc7ad21/resourcegroups/aml-quickstarts-148146/workspaces/quick-starts-ws-148146&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-06-29T04:51:07.229545][API][INFO]Experiment created<END>\n""<START>[2021-06-29T04:51:07.703952][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-06-29T04:51:07.876799][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-06-29T04:51:37.6294915Z][SCHEDULER][INFO]Scheduling job, id='HD_4965bb08-ab91-4c1d-8c56-0d94bd5bb999_1'<END><START>[2021-06-29T04:51:37.6281543Z][SCHEDULER][INFO]Scheduling job, id='HD_4965bb08-ab91-4c1d-8c56-0d94bd5bb999_0'<END><START>[2021-06-29T04:51:37.6307834Z][SCHEDULER][INFO]Scheduling job, id='HD_4965bb08-ab91-4

'BestHdriveModel/sklearn_hdrive_model.pkl/sklearn_hdrive_model.pkl'

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
data = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(data, separator=',', header=True, encoding='utf8')

In [7]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [13]:
print(x.columns)
print(x.head())
print(y.head())

Index(['age', 'marital', 'default', 'housing', 'loan', 'month', 'day_of_week',
       'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'contact_cellular', 'contact_telephone', 'education_basic.4y',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown'],
      dtype='object')
   age  marital  default  housing  loan  month  day_of_week  duration  \
0   57        1        0        0     1      5            1       371   
1   55        1        0        1     0      5            4       285   
2   33        1        0        0

In [14]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.


automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ds,
    label_column_name='y',
    n_cross_validations=3,
    compute_target=amlcomp,
    max_concurrent_iterations=4)

In [15]:
# Submit your automl run

### YOUR CODE HERE ###
autorun = exp.submit(config=automl_config, show_output=True)
RunDetails(autorun).show()

Submitting remote run.
No run_configuration provided, running on P1Cluster with default configuration
Running on remote compute: P1Cluster


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_65bf57ca-e49d-4eb8-80eb-26d0b9405213,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the sm

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [16]:
# Retrieve and save your best automl model.


#autorun.wait_for_completion(show_output=True)

best_run_auto, fittedmodel = autorun.get_output()
print(fittedmodel.steps)
model_name = best_run_auto.properties['model_name']
description = 'AutoML for P1'
tags = None
bestAutofolder="BestAutoModel"
modelnameauto = 'auto_model'
#model = best_run_auto.register_model(model_name=model_name, description = description, tags = tags)
print(model.name, model.id, model.version, sep='\t')
model.download(target_dir=f'./{bestAutofolder}/{modelnameauto}.pkl', exist_ok = True)




Package:azureml-automl-runtime, training version:1.31.0, current version:1.30.0
Package:azureml-core, training version:1.31.0, current version:1.30.0
Package:azureml-dataprep, training version:2.18.0, current version:2.15.1
Package:azureml-dataprep-native, training version:36.0.0, current version:33.0.0
Package:azureml-dataprep-rslex, training version:1.16.0, current version:1.13.0
Package:azureml-dataset-runtime, training version:1.31.0, current version:1.30.0
Package:azureml-defaults, training version:1.31.0, current version:1.30.0
Package:azureml-interpret, training version:1.31.0, current version:1.30.0
Package:azureml-mlflow, training version:1.31.0, current version:1.30.0
Package:azureml-pipeline-core, training version:1.31.0, current version:1.30.0
Package:azureml-telemetry, training version:1.31.0, current version:1.30.0
Package:azureml-train-automl-client, training version:1.31.0, current version:1.30.0
Package:azureml-train-automl-runtime, training version:1.31.0, current ver

[('datatransformer', DataTransformer(
    task='classification',
    is_onnx_compatible=False,
    enable_feature_sweeping=True,
    enable_dnn=False,
    force_text_dnn=False,
    feature_sweeping_timeout=86400,
    featurization_config=None,
    is_cross_validation=True,
    feature_sweeping_config={}
)), ('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(
    estimators=[('64', Pipeline(memory=None,
             steps=[('sparsenormalizer', Normalizer(copy=True, norm='l2')),
                    ('xgboostclassifier',
                     XGBoostClassifier(booster='gbtree', colsample_bytree=0.7, eta=0.1, max_depth=4, max_leaves=0, n_estimators=100, n_jobs=1, objective='reg:logistic', problem_info=ProblemInfo(
        dataset_samples=32950,
        dataset_features=132,
        dataset_classes=2,
        dataset_num_categorica...
        iteration_timeout_param=None,
        feature_column_names=None,
        label_column_name=None,
        weight_column_name=None,
        

'BestAutoModel/auto_model.pkl/sklearn_hdrive_model.pkl'