# Diabetes dataset

In [5]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import datasets

import mlflow
import mlflow.sklearn

## Data preparation

In [1]:
# Load Diabetes datasets
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Create pandas DataFrame for sklearn ElasticNet linear_model
Y = np.array([y]).transpose()
d = np.concatenate((X, Y), axis=1)
cols = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'progression']
data = pd.DataFrame(d, columns=cols)

# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

warnings.filterwarnings("ignore")
np.random.seed(40)

# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data)

# The predicted column is "progression" which is a quantitative measure of disease progression one year after baseline
train_x = train.drop(["progression"], axis=1)
test_x = test.drop(["progression"], axis=1)
train_y = train[["progression"]]
test_y = test[["progression"]]

## Demo without MLflow

In [6]:
# alpha is the penalisation - l1_ratio = 1 is the lasso penalty - l1_ratio = 0 is the ridge penalty
lr = ElasticNet(alpha=0.01, l1_ratio=0.25, random_state=42)
lr.fit(train_x, train_y)

predicted_qualities = lr.predict(test_x)

(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (0.01, 0.25))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)


Elasticnet model (alpha=0.010000, l1_ratio=0.250000):
  RMSE: 69.95922060492441
  MAE: 58.727387642314135
  R2: 0.259985752578914


In [7]:
# alpha is the penalisation - l1_ratio = 1 is the lasso penalty - l1_ratio = 0 is the ridge penalty
lr = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42)
lr.fit(train_x, train_y)

predicted_qualities = lr.predict(test_x)

(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (0.01, 0.5))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)


Elasticnet model (alpha=0.010000, l1_ratio=0.500000):
  RMSE: 67.98354871814037
  MAE: 56.73838604280737
  R2: 0.30119208073981807


In [8]:
# alpha is the penalisation - l1_ratio = 1 is the lasso penalty - l1_ratio = 0 is the ridge penalty
lr = ElasticNet(alpha=0.01, l1_ratio=0.75, random_state=42)
lr.fit(train_x, train_y)

predicted_qualities = lr.predict(test_x)

(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (0.01, 0.75))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)


Elasticnet model (alpha=0.010000, l1_ratio=0.750000):
  RMSE: 65.28994906390733
  MAE: 53.75914828434926
  R2: 0.35547047607749416


In [9]:
# alpha is the penalisation - l1_ratio = 1 is the lasso penalty - l1_ratio = 0 is the ridge penalty
lr = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42)
lr.fit(train_x, train_y)

predicted_qualities = lr.predict(test_x)

(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (0.001, 0.5))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)


Elasticnet model (alpha=0.001000, l1_ratio=0.500000):
  RMSE: 63.01112461968615
  MAE: 50.87364330131877
  R2: 0.3996775038471697


## Demo with MLflow

In [10]:
def train_diabetes(in_alpha, in_l1_ratio):
    
    if float(in_alpha) is None:
        alpha = 0.05
    else:
        alpha = float(in_alpha)
    
    if float(in_l1_ratio) is None:
        l1_ratio = 0.05
    else:
        l1_ratio = float(in_l1_ratio)
  
    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out ElasticNet model metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")


In [11]:
train_diabetes(0.001, 1)

Elasticnet model (alpha=0.001000, l1_ratio=1.000000):
  RMSE: 63.27750123317956
  MAE: 50.92096285436428
  R2: 0.39459110411953346


In [12]:
train_diabetes(0.001, 0.25)

Elasticnet model (alpha=0.001000, l1_ratio=0.250000):
  RMSE: 63.1646599850253
  MAE: 51.05887951773729
  R2: 0.39674840116161647


In [13]:
train_diabetes(0.001, 0.5)

Elasticnet model (alpha=0.001000, l1_ratio=0.500000):
  RMSE: 63.01112461968615
  MAE: 50.87364330131877
  R2: 0.3996775038471697


In [14]:
train_diabetes(0.001, 0.75)

Elasticnet model (alpha=0.001000, l1_ratio=0.750000):
  RMSE: 63.000141912875
  MAE: 50.92666300886525
  R2: 0.39988675551248043


In [15]:
train_diabetes(0.001, 0)

Elasticnet model (alpha=0.001000, l1_ratio=0.000000):
  RMSE: 63.39890631425896
  MAE: 51.32389750703385
  R2: 0.39226578395239375


In [16]:
train_diabetes(0.01, 1)

Elasticnet model (alpha=0.010000, l1_ratio=1.000000):
  RMSE: 63.24666777876805
  MAE: 51.051828604086325
  R2: 0.3951809598912357


In [17]:
train_diabetes(0.01, 0.25)

Elasticnet model (alpha=0.010000, l1_ratio=0.250000):
  RMSE: 69.95922060492441
  MAE: 58.727387642314135
  R2: 0.259985752578914


In [18]:
train_diabetes(0.01, 0.5)

Elasticnet model (alpha=0.010000, l1_ratio=0.500000):
  RMSE: 67.98354871814037
  MAE: 56.73838604280737
  R2: 0.30119208073981807


In [19]:
train_diabetes(0.01, 0.75)

Elasticnet model (alpha=0.010000, l1_ratio=0.750000):
  RMSE: 65.28994906390733
  MAE: 53.75914828434926
  R2: 0.35547047607749416


In [20]:
train_diabetes(0.01, 0)

Elasticnet model (alpha=0.010000, l1_ratio=0.000000):
  RMSE: 71.45646564111745
  MAE: 60.13924795969087
  R2: 0.22797170223391094


## Pipeline, GridSearch & Cross validation  - without MLflow

In [18]:
lr = ElasticNet(random_state=42)

# GridSearch
param_grid = [{'alpha': np.logspace(-3, 3, 7),
               'l1_ratio': [1, 0.25, 0.5, 0,75, 0],
               }]

estimator = GridSearchCV(lr, param_grid, cv=5, scoring='neg_mean_squared_error')

estimator.fit(train_x, train_y)

predicted_qualities = estimator.predict(test_x)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha={}, l1_ratio={}):".format(estimator.best_params_['alpha'], 
                                                          estimator.best_params_['l1_ratio']))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

Elasticnet model (alpha=0.001, l1_ratio=1):
  RMSE: 67.98354871814037
  MAE: 56.73838604280737
  R2: 0.30119208073981807


In [20]:
lr = ElasticNet(random_state=42)

# GridSearch
param_grid = [{'alpha': np.logspace(-3, 3, 7),
               'l1_ratio': [1, 0.25, 0.5, 0,75, 0],
               'tol': [0.00001, 0.0001, 0.001]
               }]

estimator = GridSearchCV(lr, param_grid, cv=5, scoring='neg_mean_squared_error')

estimator.fit(train_x, train_y)

predicted_qualities = estimator.predict(test_x)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha={}, l1_ratio={}, tol={}):".format(estimator.best_params_['alpha'], 
                                                                 estimator.best_params_['l1_ratio'],
                                                                 estimator.best_params_['tol']))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

Elasticnet model (alpha=0.001, l1_ratio=1, tol=1e-05):
  RMSE: 67.98354871814037
  MAE: 56.73838604280737
  R2: 0.30119208073981807


### With PCA

In [25]:
pipeline = Pipeline([
    ('pca', PCA(random_state=42)),
    ('lr', ElasticNet(random_state=42))
])

# GridSearch
param_grid = [{'pca__n_components': [2, 3, 4, 5],
               'lr__alpha': np.logspace(-3, 3, 7),
               'lr__l1_ratio': [1, 0.25, 0.5, 0,75, 0],
               }]

estimator = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

estimator.fit(train_x, train_y)

predicted_qualities = estimator.predict(test_x)

# Print out ElasticNet model metrics
print("Elasticnet model (n_components={}, alpha={}, l1_ratio={}):".format(estimator.best_params_['pca__n_components'], 
                                                                          estimator.best_params_['lr__alpha'],
                                                                          estimator.best_params_['lr__l1_ratio']))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

Elasticnet model (n_components=4, alpha=0.001, l1_ratio=1):
  RMSE: 67.98354871814037
  MAE: 56.73838604280737
  R2: 0.30119208073981807


## Pipeline, GridSearch & Cross validation  - with MLflow

Not there yet!


[Adding hyperparamters tracking example #326](https://github.com/mlflow/mlflow/issues/326)

In [27]:
def train_diabetes(in_alpha, in_l1_ratio):
    
    if float(in_alpha) is None:
        alpha = 0.05
    else:
        alpha = float(in_alpha)
    
    if float(in_l1_ratio) is None:
        l1_ratio = 0.05
    else:
        l1_ratio = float(in_l1_ratio)
  
    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out ElasticNet model metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")
