In [11]:
# Import various libraries including matplotlib, sklearn, mlflow
import os
import warnings
import sys

import pandas as pd
import numpy as np
from itertools import cycle
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path, enet_path
from sklearn import datasets

# Import mlflow
import mlflow.sklearn


def _resolve_hyperparameter(value, default):
    """Return ``value`` converted to float, or ``default`` when ``value`` is None."""
    # The None test has to come *before* float(): float() never returns None,
    # and float(None) raises TypeError, so checking afterwards can never pick
    # the default.
    if value is None:
        return default
    return float(value)


def train_diabetes(in_alpha=None, in_l1_ratio=None):
    """Train an ElasticNet model on the sklearn Diabetes dataset and log it to MLflow.

    The predicted "progression" column is a quantitative measure of disease
    progression one year after baseline, see
    http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html

    Parameters
    ----------
    in_alpha : float-convertible or None, optional
        Value passed to ``ElasticNet(alpha=...)``. ``None`` selects 0.05.
    in_l1_ratio : float-convertible or None, optional
        Value passed to ``ElasticNet(l1_ratio=...)``. ``None`` selects 0.05.

    Returns
    -------
    None
        Results are printed and recorded in an MLflow run (params ``alpha`` and
        ``l1_ratio``; metrics ``rmse``, ``r2`` and ``mae``; the fitted model
        under the artifact path ``"model"``).

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float()`` when a non-None argument cannot be converted.

    Notes
    -----
    Each call installs a process-wide ``warnings`` "ignore" filter and reseeds
    NumPy's global random generator with 40.
    """
    # Load Diabetes datasets
    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    # Create pandas DataFrame for sklearn ElasticNet linear_model:
    # the target is appended to the features as a final "progression" column.
    Y = np.array([y]).transpose()
    d = np.concatenate((X, Y), axis=1)
    cols = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'progression']
    data = pd.DataFrame(d, columns=cols)

    # Evaluate metrics
    def eval_metrics(actual, pred):
        """Return (rmse, mae, r2) for the given actual and predicted values."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    # Seed the global generator so the split below is the same on every call.
    np.random.seed(40)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "progression" which is a quantitative measure of disease progression one year after baseline
    train_x = train.drop(["progression"], axis=1)
    test_x = test.drop(["progression"], axis=1)
    train_y = train[["progression"]]
    test_y = test[["progression"]]

    # Fall back to 0.05 for any hyper-parameter the caller left unspecified.
    alpha = _resolve_hyperparameter(in_alpha, 0.05)
    l1_ratio = _resolve_hyperparameter(in_l1_ratio, 0.05)

    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out ElasticNet model metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Set tracking_URI first and then reset it back to not specifying port
        # Note, we had specified this in an earlier cell
        #mlflow.set_tracking_uri(mlflow_tracking_URI)

        # Log mlflow attributes for mlflow UI
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")

    
#         # Call plot_enet_descent_path
#         image = plot_enet_descent_path(X, y, l1_ratio)
    
#         # Log artifacts (output files)
#         mlflow.log_artifact("ElasticNet-paths.png")

In [12]:
# Run 1: alpha and l1_ratio both set to 0.5.
train_diabetes(in_alpha=0.5, in_l1_ratio=0.5)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 81.81926483955999
  MAE: 69.22549116101793
  R2: -0.012188274969180402


In [13]:
# Run 2: smaller alpha (0.1) with a larger l1_ratio (0.9).
train_diabetes(in_alpha=0.1, in_l1_ratio=0.9)

Elasticnet model (alpha=0.100000, l1_ratio=0.900000):
  RMSE: 71.98302888908191
  MAE: 60.5647520017933
  R2: 0.2165516143465459


In [14]:
# Run 3: alpha and l1_ratio both set to 0.01.
train_diabetes(in_alpha=0.01, in_l1_ratio=0.01)

Elasticnet model (alpha=0.010000, l1_ratio=0.010000):
  RMSE: 71.40362571026475
  MAE: 60.09124839565383
  R2: 0.2291130640003659
