In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
import pandas as pd

ws = Workspace.from_config()
ws.write_config(path='.azureml')
exp = Experiment(workspace=ws, name="udacity-project")

subscription_id = '417646f6-643f-4fcd-96b8-77997bfdfac3'
resource_group = 'kgovind-ml-wksp'
workspace_name = 'kgovindmlwk'

workspace = Workspace(subscription_id, resource_group, workspace_name)

run = exp.start_logging()

In [8]:
output = {}
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Experiment Name'] = exp.name
pd.set_option('display.max_colwidth', -1)
outputDF = pd.DataFrame(data=output,index=[''])
outputDF.T

cpu_cluster_name = 'govindk-gpunode'

#verify cluster does not already exsist
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print ('Found Exsisting Cluster')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
    max_nodes=50)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True)

Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.


InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [9]:
from azureml.core.environment import CondaDependencies, Environment

myenv = Environment(name="govindk")
conda_dep = CondaDependencies()
myenv = Environment.from_pip_requirements('govindk', '/anaconda/envs/azureml_py38/lib/python3.8/site-packages/azureml/automl/core/validated_linux_requirements.txt')
conda_dep.add_conda_package("scikit-learn")

In [22]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import loguniform, choice
import os
import shutil

ps = RandomParameterSampling(
    {
        "--C": loguniform(0.001, 1.5),
        "--max_iter": choice(50, 100, 150)
    }
)
policy = BanditPolicy(
    evaluation_interval=1,
    slack_factor= 0.1
)

if "training" not in os.listdir():
    os.mkdir("./training")
    
script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
shutil.copy('./train.py', script_folder)

sk_estimator = SKLearn(
    source_directory= script_folder,
    compute_target= compute_target,
    entry_script= "train.py",
    vm_size="STANDARD_NC6",
    vm_priority="dedicated"
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=sk_estimator,
    hyperparameter_sampling= ps,
    policy= policy,
    primary_metric_name= "Accuracy",
    primary_metric_goal= PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=30,
    max_concurrent_runs=10
)



In [23]:

# Submit your hyperdrive run to the experiment and show run details with the widget.
hd_run=exp.submit(config=hyperdrive_config)
RunDetails(hd_run).show()
hd_run.wait_for_completion(show_output= True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8
Web View: https://ml.azure.com/runs/HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8?wsid=/subscriptions/417646f6-643f-4fcd-96b8-77997bfdfac3/resourcegroups/kgovind-ml-wksp/workspaces/kgovindmlwk&tid=2d8cc149-4c57-462b-a1fb-6e548ffd73bd

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-08-13T15:44:05.271571][API][INFO]Experiment created<END>\n""<START>[2021-08-13T15:44:05.697068][GENERATOR][INFO]Trying to sample '10' jobs from the hyperparameter space<END>\n""<START>[2021-08-13T15:44:05.923836][GENERATOR][INFO]Successfully sampled '10' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8
Web View: https://ml.azure.com/runs/HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8?wsid=/subscriptions/417646f6-643f-4fcd-96b8-77997bfdfac3/resourcegroups/kgovind-ml-wksp/workspaces/kgovindmlwk&tid=2d8cc149-4c57-462b-a1fb-6e548ffd73bd



{'runId': 'HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8',
 'target': 'govindk-gpunode',
 'status': 'Completed',
 'startTimeUtc': '2021-08-13T15:44:04.860132Z',
 'endTimeUtc': '2021-08-13T15:54:41.557093Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '4e1c54cb-7266-4e5a-80a1-b5a3a124118e',
  'user_agent': 'python/3.8.1 (Linux-5.4.0-1055-azure-x86_64-with-glibc2.10) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.32.0',
  'score': '0.9109256449165403',
  'best_child_run_id': 'HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://kgovindmstoragec045ce3ac.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8/azureml-logs/hyperdrive.txt?sv=2019-07-0

In [28]:

import joblib

best_hd= hd_run.get_best_run_by_primary_metric()
best_hd_metrics=best_hd.get_metrics()
print(best_hd.get_details()['runDefinition']['arguments'])
print(best_hd.get_file_names())
print('Best Run Accuracy:',best_hd_metrics['Accuracy'])
print('Best Run Id: ', best_hd.id)
print('\n Details:',best_hd_metrics)

model=best_hd.register_model(model_name='LogisticRegression-bank', model_path='outputs/model.joblib')

['--C', '3.246548569630282', '--max_iter', '150']
['azureml-logs/55_azureml-execution-tvmps_7fa4f1021d1a6b9b79b1aecef9d366aa876a7c5786f15c696f2db9e42552100c_d.txt', 'azureml-logs/65_job_prep-tvmps_7fa4f1021d1a6b9b79b1aecef9d366aa876a7c5786f15c696f2db9e42552100c_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_7fa4f1021d1a6b9b79b1aecef9d366aa876a7c5786f15c696f2db9e42552100c_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/108_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']
Best Run Accuracy: 0.9109256449165403
Best Run Id:  HD_49f2944d-bd0e-4727-bfbf-8dd86037f0e8_0

 Details: {'Regularization Strength:': 3.246548569630282, 'Max iterations:': 150, 'Accuracy': 0.9109256449165403}


In [15]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
path= "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data=TabularDatasetFactory.from_delimited_files(path=path)

In [16]:
from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd

x, y = clean_data(data)
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.20)

train_df=pd.concat([x_train, y_train], axis=1)
test_df=pd.concat([x_test, y_test], axis=1)

train_df

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
8541,54,1,0,1,0,7,4,220,1,999,...,0,0,0,1,0,0,0,0,0,0
30036,28,0,0,0,0,6,3,156,3,999,...,1,0,0,0,0,0,0,1,0,0
10477,36,1,0,1,0,5,2,284,2,999,...,1,0,1,0,0,0,0,0,0,0
20101,26,0,0,1,0,7,3,182,1,999,...,0,0,0,0,0,0,0,1,0,0
1687,48,1,0,1,1,11,5,53,1,999,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31015,53,0,0,0,1,11,4,61,2,999,...,0,0,0,0,0,0,0,1,0,0
2848,44,1,0,0,1,5,4,285,1,999,...,1,0,1,0,0,0,0,0,0,0
3053,29,1,0,1,0,5,1,565,1,999,...,0,0,0,0,0,0,0,1,0,1
25653,57,1,0,1,0,8,4,74,1,999,...,0,0,0,0,0,0,0,1,0,0


In [17]:
if not os.path.isdir('data'):
    os.mkdir('data')
pd.DataFrame(train_df).to_csv("data/train_data.csv", index=False)
pd.DataFrame(test_df).to_csv("data/test_data.csv", index=False)

ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='bankmarketing', overwrite=True, show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 1 files out of an estimated total of 2
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_673e29520b9844aa93a62fe5bf0e5d6b

In [18]:
from azureml.core.dataset import Dataset
from azureml.train.automl.utilities import get_primary_metrics

train_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))
get_primary_metrics("classification")

['accuracy',
 'precision_score_weighted',
 'norm_macro_recall',
 'AUC_weighted',
 'average_precision_score_weighted']

In [19]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    compute_target=compute_target,
    experiment_exit_score=0.99,
    primary_metric="accuracy",
    training_data=train_data,
    label_column_name='y',
    n_cross_validations=4,
    max_concurrent_iterations=30)

automl_run = exp.submit(automl_config, show_output = False)
automl_run
automl_run.wait_for_completion()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_ba431ca8-9c0e-49bb-9019-4c0dffeb928a,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


{'runId': 'AutoML_ba431ca8-9c0e-49bb-9019-4c0dffeb928a',
 'target': 'govindk-gpunode',
 'status': 'Completed',
 'startTimeUtc': '2021-08-13T05:40:00.888342Z',
 'endTimeUtc': '2021-08-13T06:21:25.841377Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'govindk-gpunode',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"252c3aa3-b45d-4ebb-b2f7-90e3c893195d\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.32.0", "azureml-train": "1.32.0", "azureml-train-restclients-hyperdrive": "1.32.0", "azureml-train-core": "1.32.0", "azureml-train-automl-client": "1.32.0", "azureml-tensorboard": "1.32.0", "azureml-telemetry": "1.32.0", "azureml

In [21]:
from azureml.widgets import RunDetails
RunDetails(automl_run).show()

best_run, fitted_model = automl_run.get_output()

best_run_metrics = best_run.get_metrics() 
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
best_run.register_model(model_name = 'best_run_model', model_path = 'outputs/')

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Package:azureml-core, training version:1.33.0, current version:1.32.0
Package:azureml-dataprep, training version:2.20.1, current version:2.18.0
Package:azureml-dataprep-native, training version:38.0.0, current version:36.0.0
Package:azureml-dataprep-rslex, training version:1.18.1, current version:1.16.1
Package:azureml-dataset-runtime, training version:1.33.0, current version:1.32.0
Package:azureml-defaults, training version:1.33.0, current version:1.32.0
Package:azureml-interpret, training version:1.33.0, current version:1.32.0
Package:azureml-mlflow, training version:1.33.0, current version:1.32.0.post1
Package:azureml-pipeline-core, training version:1.33.0, current version:1.32.0
Package:azureml-telemetry, training version:1.33.0, current version:1.32.0
Package:azureml-train-automl-client, training version:1.33.0, current version:1.32.0
Package:azureml-responsibleai, training version:1.33.0
Package:azureml-train-automl-runtime, training version:1.33.0


AUC_weighted 0.947412273648971
weighted_accuracy 0.9581133466853762
AUC_macro 0.947412273648971
AUC_micro 0.9807972142460757
precision_score_macro 0.8000434802638308
precision_score_micro 0.9169954476479515
log_loss 0.32105446954468375
recall_score_micro 0.9169954476479515
norm_macro_recall 0.5015483076500307
average_precision_score_weighted 0.9556628300204176
precision_score_weighted 0.9111634007147125
recall_score_weighted 0.9169954476479515
matthews_correlation 0.5485995772414385
f1_score_weighted 0.9132916536065426
f1_score_micro 0.9169954476479515
average_precision_score_macro 0.8250552443547658
accuracy 0.9169954476479515
balanced_accuracy 0.7507741538250152
average_precision_score_micro 0.9815910653316418
f1_score_macro 0.7723080788722788
recall_score_macro 0.7507741538250152
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_ba431ca8-9c0e-49bb-9019-4c0dffeb928a_723/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_ba431ca8-9c0e-49bb-9019-4c0dff

Model(workspace=Workspace.create(name='kgovindmlwk', subscription_id='417646f6-643f-4fcd-96b8-77997bfdfac3', resource_group='kgovind-ml-wksp'), name=best_run_model, id=best_run_model:1, version=1, tags={}, properties={})

KeyError: 'log_files'