# 01 - Training a Diabetes Regression Model In Notebook

In [None]:
import environs

# Read the workshop settings from the shared env file one directory up.
e_vars = environs.Env()
e_vars.read_env('../workshop.env')

# Names used by the rest of the notebook.
USER_NAME = e_vars.str("USER_NAME")
EXPERIMENT_NAME = e_vars.str('EXPERIMENT_NAME')
ENVIRONMENT_NAME = e_vars.str("ENVIRONMENT_NAME")
DATASET_NAME = e_vars.str("DATASET_NAME")

# Fail fast when the username was left blank: this is a bad configuration
# value, so raise ValueError and point at the file this cell actually reads.
if not USER_NAME:
    raise ValueError("Please enter your username in the `workshop.env` file and run this cell again.")

In [None]:
%matplotlib inline
import azureml.core
from azureml.core import Experiment, Workspace
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
# Directory (relative to this notebook) that holds the workshop data files.
DATA_PATH = "../../data/"

# Connect to the Azure ML workspace described by the local config file.
ws = Workspace.from_config()

# Print a blank line, then one line per workspace attribute.
print()
workspace_summary = [
    f'Workspace name:\t{ws.name}',
    f'Azure region:\t{ws.location}',
    f'Subscription:\t{ws.subscription_id}',
    f'Resource group:\t{ws.resource_group}',
]
print(*workspace_summary, sep='\n')

In [None]:
# Handle to the experiment named in the env file; runs started later in this
# notebook are recorded under it.
experiment = Experiment(workspace=ws, name=EXPERIMENT_NAME)

---

## Data
We will use the diabetes dataset for this experiment, a well-known small dataset that comes with scikit-learn.  The following cells load the dataset from a CSV file and split it into random training and testing sets.


In [None]:
# Load the diabetes data from the CSV file in the data directory.
diabetes_csv_path = os.path.join(DATA_PATH, "diabetes.csv")
diabetes_df = pd.read_csv(diabetes_csv_path)

# Preview the first five rows.
diabetes_df.head()

In [None]:
# Separate the regression target from the feature columns WITHOUT mutating
# `diabetes_df`: popping the column in place would make this cell fail with a
# KeyError when it is re-run, and would silently change the frame shown above.
y = diabetes_df['target'].values
X = diabetes_df.drop(columns='target').values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print (f"Data contains {len(X_train)} training samples and {len(X_test)} test samples")

---
## Train

Let's use scikit-learn to train a simple Ridge regression model.  We use AML to record interesting information about the model in an Experiment.  An Experiment contains a series of trials called Runs.  During this trial we use AML in the following way:
* We access an experiment from our AML workspace by name, which will be created if it doesn't exist
* We use `with experiment.start_logging() as run` to create a new run in this experiment in a Python `with` statement context manager.
* We use `run.log()` to record a parameter, alpha, and accuracy measures - the Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and R2 score - to the run.  We will be able to review and compare these measures in the Azure Portal at a later time.
* We store the resulting model in the **outputs** directory, which is automatically captured by AML when the run is complete.

In [None]:
def plot_residuals_v_actuals(y, y_hat):
    """Residuals (y-axis) vs. Actuals (x-axis) - colored green

    Args:
        y: Actual target values (assumed array-like supporting element-wise
            subtraction, e.g. a NumPy array).
        y_hat: Predicted values, same length as `y`.

    Returns:
        The matplotlib Figure holding the regression plot. The figure is
        closed in pyplot before being returned, so the caller decides when to
        display or log it.
    """
    resids = y - y_hat

    fig, ax = plt.subplots()
    # Pass x/y by keyword: newer seaborn releases no longer accept them positionally.
    sns.regplot(x=y, y=resids, color='g', ax=ax)

    ax.set_title('Residual vs. Actual')
    ax.set_xlabel("Actual Value")
    ax.set_ylabel("Residuals")

    plt.close(fig)
    return fig

def plot_predictions(y, y_hat):
    """Predictions (y-axis) vs. Actuals (x-axis) - colored blue

    Args:
        y: Actual target values (assumed array-like, e.g. a NumPy array).
        y_hat: Predicted values, same length as `y`.

    Returns:
        The matplotlib Figure holding the regression plot. The figure is
        closed in pyplot before being returned, so the caller decides when to
        display or log it.
    """
    fig, ax = plt.subplots()

    # Pass x/y by keyword: newer seaborn releases no longer accept them positionally.
    sns.regplot(x=y, y=y_hat, color='b', ax=ax)

    ax.set_title("Prediction vs. Actual")
    ax.set_xlabel("Actual Value")
    ax.set_ylabel("Predicted Value")

    plt.close(fig)
    return fig

def plot_resid_histogram(y, y_hat):
    """Histogram of the residuals (y - y_hat) with a density curve - colored green

    Args:
        y: Actual target values (assumed array-like supporting element-wise
            subtraction, e.g. a NumPy array).
        y_hat: Predicted values, same length as `y`.

    Returns:
        The matplotlib Figure holding the histogram. The figure is closed in
        pyplot before being returned, so the caller decides when to display
        or log it.
    """
    resids = y - y_hat

    fig, ax = plt.subplots()
    if hasattr(sns, "histplot"):
        # `distplot` is deprecated in newer seaborn; this is the replacement
        # suggested by seaborn's migration notes (density histogram + KDE).
        sns.histplot(x=resids, color='g', kde=True, stat="density",
                     kde_kws={"cut": 3}, ax=ax)
    else:
        # Older seaborn releases that predate `histplot`.
        sns.distplot(resids, color='g', ax=ax)

    ax.set_title("Residual Histogram")

    plt.close(fig)
    return fig

In [None]:
# Create a run object in the experiment
with experiment.start_logging(tags={"Context": "Notebook"}) as run:
    # Regularization strength for the Ridge model (logged to the run below)
    alpha = 0.03
    
    # Create, fit, and test the scikit-learn Ridge regression model
    regression_model = Ridge(alpha=alpha)
    regression_model.fit(X_train, y_train)
    preds = regression_model.predict(X_test)

    # Compute RMSE, MAE and R2 on the held-out test set and print them to the notebook
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"RMSE:\t{np.round(rmse,4)}",
          f"MAE:\t{np.round(mae,4)}",
          f"R2:\t{np.round(r2,4)}",
          sep='\n')
   
    # Build the diagnostic figures (returned closed, so they are only shown on request)
    resid_fig = plot_residuals_v_actuals(y_test, preds)
    resid_hist = plot_resid_histogram(y_test, preds)
    pred_plt = plot_predictions(y_test, preds)
    
    
    # Log metrics to Azure ML
    # THIS IS THE ONLY AML SPECIFIC CODE HERE #
    run.log('alpha', alpha)
    run.log('model_type', 'Ridge')
    run.log('rmse', rmse)
    run.log('mae', mae)
    run.log('r2', r2)
    run.log_image(name='residuals-v-actuals', plot=resid_fig)
    run.log_image(name='residuals-histogram', plot=resid_hist)
    run.log_image(name='prediction-v-actual', plot=pred_plt)
    
    # Save the model to the outputs directory for capture
    # Anything saved to ./outputs/ folder will be sent to Azure ML 
    # at the end of the run
    # Make sure the directory exists first so joblib.dump cannot fail on a
    # fresh checkout where ./outputs/ has not been created yet.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=regression_model, filename='outputs/model.pkl')
    
# Show the three figures in the notebook once the run has finished
display(pred_plt, resid_fig, resid_hist)

In [None]:
# Let's view the run in the portal: evaluating `run` as the last expression
# of the cell makes the notebook display it.
run

### Viewing experiment results
Similar to viewing the run, we can also view the entire experiment.  The experiment report view in the Azure portal lets us view all the runs in a table, and also allows us to customize charts.  This way, we can see how the alpha parameter impacts the quality of the model.

In [None]:
# Now let's take a look at the experiment in the Azure portal: evaluating
# `experiment` as the last expression of the cell makes the notebook display it.
experiment

<br><br><br><br><br>






###### Copyright (c) Microsoft Corporation. All rights reserved.  
###### Licensed under the MIT License.