![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.png)

In [1]:
!git pull

Already up to date.


In [2]:
import logging

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.data.data_reference import DataReference
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
print(azureml.core.VERSION) #should be 1.0.57 or greater

1.0.57


In [3]:
ws = Workspace.from_config()
experiment_name = 'house_prices_regression'
project_folder = './sample_projects/house_prices_regression'
experiment = Experiment(ws, experiment_name)

## Data
This uses the AzureML datastore and dataset api's, as well as data manipulation, with keep_ and drop_ columns, and random_split.

In [4]:
datastore_name = 'edvanstorage__azureml'
dataset_path = 'boston_houses/boston_data.csv'
datastore = Datastore.get(ws, datastore_name)
data_reference = DataReference(datastore, data_reference_name="boston_data", path_on_datastore=dataset_path)
boston_ds = Dataset.Tabular.from_delimited_files(data_reference)
boston_ds = boston_ds.drop_columns('Column1')
boston_ds.take(3).to_pandas_dataframe()

Unnamed: 0,CrimeRate,ResidentialZoning,IndustrialZoning,OnRiver,NOXConcentration,NumberOfRooms,PreWarHouses,DistanceToEmployment,DistanceToHighways,PropertyTaxRate,ParentTeachRatio,ProportionAA,LowerStatusProportion,Price
0,0.01,18.0,2.31,0.0,0.54,6.58,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.03,0.0,7.07,0.0,0.47,6.42,78.9,4.97,2.0,242.0,17.8,396.9,9.14,21.6
2,0.03,0.0,7.07,0.0,0.47,7.18,61.1,4.97,2.0,242.0,17.8,392.83,4.03,34.7


In [5]:
b_train, b_test = boston_ds.random_split(0.2, seed=84)
X_train = b_train.drop_columns('Price')
y_train = b_train.keep_columns('Price')
X_test = b_test.drop_columns('Price')
y_test = b_test.keep_columns('Price')

## Remote compute
Setup the environments for remote compute, we might not need it...

In [6]:
compute_target = ws.compute_targets['cpucluster']
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE

dependencies = CondaDependencies.create(
    pip_packages=["scikit-learn", "scipy", "numpy"])
run_config.environment.python.conda_dependencies = dependencies

## Train

Instantiate an `AutoMLConfig` object to specify the settings for both local and remote runs and data used to run the experiment.

In [7]:
aml_config = {"task": 'regression',
         "iteration_timeout_minutes": 10,
         "primary_metric": 'normalized_root_mean_squared_error',
         "debug_log": 'automl.log',
         "verbosity": logging.INFO,
         "enable_early_stopping": True,
         "X": X_train, 
         "y": y_train,
         "path": project_folder}

local_run = False
if local_run:
    numb_run_config = {
        'iterations': 10,
        'n_cross_validations': 5
    }
else:
    numb_run_config = {
        'iterations': 250,
        'n_cross_validations': 10
    }    
    remote_config = {
        'compute_target': compute_target,
        'run_configuration': run_config,
        'max_cores_per_iteration': 2,
        'max_concurrent_iterations': 10,
    }
    aml_config.update(remote_config)
aml_config.update(numb_run_config)
    
automl_config = AutoMLConfig(**aml_config)

In [8]:
show_run = False
if 'compute_target' in aml_config:
    print(f"Running {aml_config['iterations']} iterations on remote compute (with {compute_target.vm_size} VM's), with {aml_config['n_cross_validations']}-fold cross validation.")
else:
    print(f"Running {aml_config['iterations']} iterations on local compute, with {aml_config['n_cross_validations']}-fold cross validation.")
if not show_run:
    print("Not showing output here. Please run the Widget cell below.")
else:
    print("---------------------------------------------------------------------------------")
run = experiment.submit(automl_config, show_output = show_run)

Running 250 iterations on remote compute (with STANDARD_D2_V2 VM's), with 10-fold cross validation.
Not showing output here. Please run the Widget cell below.


In [9]:
from azureml.widgets import RunDetails
RunDetails(run).show() 

ModuleNotFoundError: No module named 'azureml.widgets'

In [10]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
house_prices_regression,AutoML_f21709b6-891f-454f-b677-f0b6b8bb626f,automl,Starting,Link to Azure Portal,Link to Documentation



#### Retrieve All Child Runs
You can also use SDK methods to fetch all the child runs and see individual metrics that we log.

In [None]:
children = list(run.get_children())
metricslist = {}
for r in children:
    properties = r.get_properties()
    metrics = {k: v for k, v in r.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

rundata = pd.DataFrame(metricslist).sort_index(1)
rundata

### Retrieve the Best Model

Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing.  Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*.

In [None]:
best_run, fitted_model = run.get_output()
print(best_run)
print("----------------------------")
print(fitted_model)

#### Best Model Based on Any Other Metric
Show the run and the model that has the smallest `root_mean_squared_error` value (which turned out to be the same as the one with largest `spearman_correlation` value):

In [None]:
lookup_metric = "spearman_correlation"
best_run, fitted_model = run.get_output(metric = lookup_metric)
print(best_run)
print("----------------------------")
print(fitted_model)

#### Model from a Specific Iteration
Show the run and the model from the third iteration:

In [None]:
iteration = 3
third_run, third_model = run.get_output(iteration = iteration)
print(third_run)
print("----------------------------")
print(third_model)

## Test

Predict on training and test set, and calculate residual values.

In [None]:
X_train_pd = X_train.to_pandas_dataframe()
y_train_pd = y_train.to_pandas_dataframe().squeeze()
X_test_pd = X_test.to_pandas_dataframe()
y_test_pd = y_test.to_pandas_dataframe().squeeze()

y_pred_train = fitted_model.predict(X_train_pd)
y_residual_train = y_train_pd - y_pred_train

y_pred_test = fitted_model.predict(X_test_pd)
y_residual_test = y_test_pd - y_pred_test

In [None]:
%matplotlib inline
from sklearn.metrics import mean_squared_error, r2_score

# Set up a multi-plot chart.
f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})
f.suptitle('Regression Residual Values', fontsize = 18)
f.set_figheight(6)
f.set_figwidth(16)

# Plot residual values of training set.
a0.axis([0, 360, -200, 200])
a0.plot(y_residual_train, 'bo', alpha = 0.5)
a0.plot([-10,360],[0,0], 'r-', lw = 3)
a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train_pd, y_pred_train))), fontsize = 12)
a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train_pd, y_pred_train)), fontsize = 12)
a0.set_xlabel('Training samples', fontsize = 12)
a0.set_ylabel('Residual Values', fontsize = 12)

# Plot a histogram.
a0.hist(y_residual_train, orientation = 'horizontal', color = 'b', bins = 10, histtype = 'step')
a0.hist(y_residual_train, orientation = 'horizontal', color = 'b', alpha = 0.2, bins = 10)

# Plot residual values of test set.
a1.axis([0, 90, -200, 200])
a1.plot(y_residual_test, 'bo', alpha = 0.5)
a1.plot([-10,360],[0,0], 'r-', lw = 3)
a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test_pd, y_pred_test))), fontsize = 12)
a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test_pd, y_pred_test)), fontsize = 12)
a1.set_xlabel('Test samples', fontsize = 12)
a1.set_yticklabels([])

# Plot a histogram.
a1.hist(y_residual_test, orientation = 'horizontal', color = 'b', bins = 10, histtype = 'step')
a1.hist(y_residual_test, orientation = 'horizontal', color = 'b', alpha = 0.2, bins = 10)

plt.show()

In [None]:
all_rmse = {}
for ite in range(aml_config['iterations']):
    i_run, model = run.get_output(iteration = ite)
    y_pred_test = model.predict(X_test_pd)
    all_rmse[ite] = np.sqrt(mean_squared_error(y_test_pd, y_pred_test))

In [None]:
all_rmse

In [None]:
best_iteration = sorted(all_rmse, key=all_rmse.get, reverse=False)[0]
print(f'Best iteration is number {best_iteration}')
print("--------------")
best_test_run, best_test_model = run.get_output(iteration = best_iteration)
print(best_test_run)
print(best_test_run.get_file_names())
print("--------------")
print(best_test_model)
# print(best_test_model.get_model_path())

In [None]:
dependencies = run.get_run_sdk_dependencies(iteration = best_iteration)

In [None]:
for p in ['azureml-train-automl', 'azureml-core']:
    print('{}\t{}'.format(p, dependencies[p]))

In [None]:
myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost==0.80'], pip_packages=['azureml-train-automl'])

conda_env_file_name = 'conda_dependencies.yml'
myenv.save_to_file('.', conda_env_file_name)

## Register model and commit dependencies and score files first

Since the Release is triggered from a new version of the model, first the dependent files are updated in the repo, then the new model is registered, which triggers the release.

In [None]:
dep_changes = !git diff conda_dependencies.yml
score_changes = !git diff score.py
inference_changes = !git diff inference_config.json
aci_config_changes = !git diff aci_deployment_config.json
aks_config_changes = !git diff aks_deployment_config.json
if len(dep_changes) > 0:
    !git add conda_dependencies.yml
if len(score_changes) > 0:
    !git add score.py
if len(inference_changes) > 0:
    !git add inference_config.json
if len(aci_config_changes) > 0:
    !git add aci_deployment_config.json
if len(aks_config_changes) > 0:
    !git add aks_deployment_config.json
if len(dep_changes)+len(score_changes)+len(inference_changes)+len(aci_config_changes)+len(aks_config_changes)>0:
    print("Changes to commit. Committing and pushing now")
    !git commit -m "Commit from NB"
    !git push
else:
    print("No changes to commit.")

In [None]:
name = "house_prices_regression"
deploy = True

# best_test_model.register(ws, )
model = best_test_run.register_model(name, model_path='outputs/model.pkl')
print(model.name, model.id, model.version, sep='\t')