In [1]:
from azureml.core import Workspace, Experiment, Run
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import HyperDriveRunConfig, PrimaryMetricGoal, BayesianParameterSampling, uniform

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
%%writefile train.py

from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.externals import joblib

# Create logger from current context
# If this is run in AML Compute, it will return the current run
# If this runs locally, it'll only print metrics to standard out
run = Run.get_context()

def load_train_test_dataset(data_path, file_name):
    """Read a CSV file and split it into training and test arrays.

    The ``target`` column is taken as the label vector and every remaining
    column is used as a feature. 20% of the rows are held out for testing,
    using a fixed random seed so the split is repeatable.

    Args:
        data_path: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``data_path``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    csv_path = os.path.join(data_path, file_name)
    frame = pd.read_csv(csv_path)

    # pop() removes the label column, leaving only feature columns behind
    labels = frame.pop('target').values
    features = frame.values

    splits = train_test_split(features, labels, test_size=0.2, random_state=0)
    X_train, X_test, y_train, y_test = splits

    print(f"Data contains {len(X_train)} training samples and {len(X_test)} test samples")
    return X_train, X_test, y_train, y_test
    
def train_elasticnet(X, y, alpha, l1_ratio):
    """Fit an ElasticNet model, save it under ``./outputs`` and return it.

    Args:
        X: Training feature matrix.
        y: Training target values.
        alpha: ``alpha`` argument forwarded to ``ElasticNet``.
        l1_ratio: ``l1_ratio`` argument forwarded to ``ElasticNet``.

    Returns:
        The fitted ``ElasticNet`` instance.
    """
    run.log('model_type', 'ElasticNet')

    # Fixed random_state keeps repeated fits with the same inputs comparable
    settings = dict(alpha=alpha, l1_ratio=l1_ratio, copy_X=True, random_state=40760)
    model = ElasticNet(**settings)
    model.fit(X, y)

    # Persist the fitted model; the folder is created on first use
    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(value=model, filename='./outputs/model.pkl')

    return model

def predict_and_log_performance(model, X_test, y_test):
    """Predict on the test set and log RMSE, MAE and R^2 to the current run.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        The predictions produced by ``model.predict(X_test)``.
    """
    preds = model.predict(X_test)

    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    run.log("rmse", rmse)

    mae = mean_absolute_error(y_test, preds)
    run.log('mae', mae)

    r2 = r2_score(y_test, preds)
    run.log('r2', r2)

    return preds

def plot_residuals(y, y_hat):
    """Plot residuals against actual values and log the figure to the run.

    Residuals are computed as ``y - y_hat`` and drawn with a seaborn
    regression plot (actual value on the x axis, residual on the y axis).

    Args:
        y: Actual target values.
        y_hat: Predicted values; must support elementwise subtraction from ``y``.

    Returns:
        The matplotlib figure holding the residual plot.
    """
    resids = y - y_hat

    # Draw on an explicit Axes instead of relying on pyplot's "current figure"
    fig, ax = plt.subplots()

    # Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts
    # x and y positionally, while keywords work on older releases as well.
    sns.regplot(x=y, y=resids, ax=ax)

    ax.set_xlabel("Actual Value")
    ax.set_ylabel("Residuals")

    # Close the figure so pyplot stops tracking it; the Figure object itself
    # is still passed to log_image and returned below.
    plt.close(fig)

    # Log to AML and return
    run.log_image('residuals', plot=fig)
    return fig

    
if __name__ == "__main__":
    # Command-line interface: data location plus the two ElasticNet hyperparameters
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--data-path', '-d',
        type=str,
        required=True,
        help="The path where the data file is located",
    )
    cli.add_argument(
        '--alpha', '-a',
        type=float,
        help="The alpha value for training",
        default=0.03,
    )
    cli.add_argument(
        '--file-name', '-f',
        type=str,
        default='diabetes.csv',
        help="The file name of the diabetes csv dataset",
    )
    cli.add_argument(
        '--l1-ratio',
        type=float,
        default=0.05,
        help='The l1_ratio of the Scikit-Learn ElasticNet model',
    )
    opts = cli.parse_args()

    # Load the data and carve out the held-out test set
    X_train, X_test, y_train, y_test = load_train_test_dataset(opts.data_path, opts.file_name)

    # Record the hyperparameters used for this run
    for hyperparameter in ('alpha', 'l1_ratio'):
        run.log(hyperparameter, getattr(opts, hyperparameter))

    # Fit the model on the training split
    fitted_model = train_elasticnet(X=X_train, y=y_train, alpha=opts.alpha, l1_ratio=opts.l1_ratio)

    # Score the held-out split and log the metrics
    test_preds = predict_and_log_performance(model=fitted_model, X_test=X_test, y_test=y_test)

    # Log a residual plot for the held-out split
    residual_fig = plot_residuals(y_test, test_preds)

Overwriting train.py


## Test File Locally

In [3]:
%%cmd
python train.py --data-path ../../data

Microsoft Windows [Version 10.0.17763.379]
(c) 2018 Microsoft Corporation. All rights reserved.

(azureml-sdk) C:\Users\erikzwi\Projects\AML-Demo-Diabetes\code\02-aml_compute>python train.py --data-path ../../data
Data contains 353 training samples and 89 test samples
Attempted to log scalar metric alpha:
0.03
Attempted to log scalar metric l1_ratio:
0.05
Attempted to log scalar metric model:
ElasticNet
Attempted to log scalar metric rmse:
66.39460136852927
Attempted to log scalar metric mae:
54.66786232493984
Attempted to log scalar metric r2:
0.1403459788724033
Attempted to log image metric residuals:
Figure(640x480)

(azureml-sdk) C:\Users\erikzwi\Projects\AML-Demo-Diabetes\code\02-aml_compute>

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


## Submit to AML Compute for Computation

In [4]:
# Load the workspace described by the local config file
ws = Workspace.from_config()

# Attach to the experiment that groups the runs submitted from this notebook
experiment_name = 'diabetes_regression'
experiment = Experiment(name=experiment_name, workspace=ws)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Found the config file in: C:\Users\erikzwi\Projects\AML-Demo-Diabetes\code\aml_config\config.json


In [19]:
# Look up, by name, the datastore and the compute cluster registered in the
# workspace; both are used when the Estimator is built in a later cell.
datastore = ws.datastores['diabetes']
cpu_cluster = ws.compute_targets['cpu-cluster']


In [20]:
# Script arguments for train.py: --data-path receives the mounted datastore
args = {"--data-path": datastore.as_mount()}

# Estimator describing how train.py is run on the CPU cluster,
# including the conda packages the script needs
est = Estimator(
    source_directory=".",
    entry_script="train.py",
    script_params=args,
    compute_target=cpu_cluster,
    conda_packages=['scikit-learn', 'seaborn', 'pandas'],
)

In [21]:
# Submit the estimator to the experiment; the returned run is displayed in the next cell
run = experiment.submit(est)

In [22]:
# The first time this runs it may take a while because the container has to be
# built (about 8 minutes in the author's test).

from azureml.widgets import RunDetails
# Show the run-details widget for the submitted run
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Tune Hyperparameters

In [15]:
# Search space: both train.py hyperparameters are drawn uniformly from [0, 1]
search_space = {
    '--alpha': uniform(0, 1),
    '--l1-ratio': uniform(0, 1),
}
sampled_params = BayesianParameterSampling(search_space)

# Minimise the 'rmse' metric logged by train.py: 15 runs in total, 3 at a time
hyper_run_config = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=sampled_params,
    primary_metric_name='rmse',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=15,
    max_concurrent_runs=3,
)

In [16]:
# Submit the HyperDrive configuration to the experiment and show its widget
hd_run = experiment.submit(hyper_run_config)
RunDetails(hd_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

# Your Turn:
Choose another scikit-learn algorithm - once again, check out [Scikit-Learn.ensemble](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble) or [Scikit-Learn.linear_model](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model) for other algorithms to try.

__Challenge:__ Make the model type itself a hyperparameter that HyperDrive selects. _Hint: Check out `azureml.train.hyperdrive.choice`_