In [1]:
import warnings
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn

# Data

data_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/'

In [2]:
# Load the wine-quality CSV; the file uses ';' as its field separator.
data_path = "data/winequality-red.csv"
data = pd.read_csv(data_path, sep=";")

# Preview five randomly sampled rows as a quick sanity check
# (no random_state is passed, so the rows shown can change between runs).
data.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
902,7.4,0.635,0.1,2.4,0.08,16.0,33.0,0.99736,3.58,0.69,10.8,7
437,11.1,0.45,0.73,3.2,0.066,6.0,22.0,0.9986,3.17,0.66,11.2,6
1445,7.4,0.785,0.19,5.2,0.094,19.0,98.0,0.99713,3.16,0.52,9.6,6
272,10.9,0.37,0.58,4.0,0.071,17.0,65.0,0.99935,3.22,0.78,10.1,5
93,7.7,0.49,0.26,1.9,0.062,9.0,31.0,0.9966,3.39,0.64,9.6,5


# 1. Tracking experiments

To start the MLflow tracking server, run:

`mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host localhost --port 5000`

In [3]:
# Send all subsequent MLflow calls in this session to the tracking server on localhost:5000.
remote_server_uri = "http://localhost:5000"
mlflow.set_tracking_uri(remote_server_uri)

In [4]:
# Display the tracking URI currently in effect to confirm the setting took hold.
mlflow.tracking.get_tracking_uri()

'http://localhost:5000'

In [5]:
# Select the experiment that subsequent runs are logged under. Much like switching
# to a git branch, MLflow activates the experiment if it exists and creates it otherwise.
exp_name = "Elasticnet_wine"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='mlruns/1', experiment_id='1', lifecycle_stage='active', name='Elasticnet_wine', tags={}>

# 2. Modelling

In [6]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    actual : array-like
        True target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root mean squared error, mean absolute error
        and the R^2 score, in that order.
    """
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def load_data(data_path):
    """Read a semicolon-separated CSV and split it into train/test features and targets.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to read; it must contain a ``quality`` column.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` frames hold every
        column except ``quality`` and the ``*_y`` series hold ``quality`` itself.
    """
    target = 'quality'
    frame = pd.read_csv(data_path, sep=";")

    # Neither test_size nor random_state is given, so scikit-learn's default
    # split proportions (75% train / 25% test) apply and the shuffle draws from
    # NumPy's global random state.
    train_part, test_part = train_test_split(frame)

    # Separate the predictors from the column being predicted in each partition.
    train_x, test_x = (part.drop(columns=[target]) for part in (train_part, test_part))
    train_y, test_y = (part[target] for part in (train_part, test_part))

    return train_x, train_y, test_x, test_y

In [7]:
def train(alpha=0.5, l1_ratio=0.5, data_path="data/winequality-red.csv"):
    """Fit an ElasticNet model on the wine data and record the run in MLflow.

    One MLflow run is opened per call. The hyper-parameters, the evaluation
    metrics (rmse, mae, r2), the input CSV and the fitted model are logged to
    it, and the metrics plus the artifact location are printed.

    Parameters
    ----------
    alpha : float, default 0.5
        Passed to ``ElasticNet(alpha=...)``.
    l1_ratio : float, default 0.5
        Passed to ``ElasticNet(l1_ratio=...)``.
    data_path : str, default "data/winequality-red.csv"
        Semicolon-separated CSV with a ``quality`` column. The default is a
        relative path, so the notebook must run from the project root.

    Returns
    -------
    None
    """
    # Suppress warnings only for the duration of this call. A bare
    # warnings.filterwarnings("ignore") would silence every later cell too.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # load_data() splits without an explicit random_state, so seeding
        # NumPy's global RNG here is what keeps the train/test split repeatable.
        np.random.seed(40)

        train_x, train_y, test_x, test_y = load_data(data_path)

        # Everything logged inside this context belongs to one MLflow run.
        with mlflow.start_run():
            # Fit ElasticNet
            lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
            lr.fit(train_x, train_y)

            # Evaluate on the held-out split
            predicted_qualities = lr.predict(test_x)
            (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

            # Print out the metrics
            print("ElasticNet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
            print(" RMSE: %s" % rmse)
            print(" MAE: %s" % mae)
            print(" R2: %s" % r2)

            # Log parameters, metrics, the input data and the model to MLflow
            mlflow.log_param(key="alpha", value=alpha)
            mlflow.log_param(key="l1_ratio", value=l1_ratio)
            mlflow.log_metric(key="rmse", value=rmse)
            mlflow.log_metrics({"mae": mae, "r2": r2})
            mlflow.log_artifact(data_path)
            print("Save to: {}".format(mlflow.get_artifact_uri()))

            mlflow.sklearn.log_model(lr, "model")

In [8]:
# Alternatively, log the same metric at several steps to track its value over time:
# with mlflow.start_run():
#     for epoch in range(0, 3):
#         mlflow.log_metric(key="quality", value=2*epoch, step=epoch)

In [14]:
# Baseline run: these values match train()'s defaults.
train(0.5, 0.5)

ElasticNet model (alpha=0.500000, l1_ratio=0.500000):
 RMSE: 0.793164022927685
 MAE: 0.6271946374319586
 R2: 0.10862644997792625
Save to: mlruns/1/804d002393cc4502808bb7c70d105e55/artifacts


In [15]:
# Second run with a smaller alpha and l1_ratio, logged as a separate MLflow run for comparison.
train(0.2, 0.2)

ElasticNet model (alpha=0.200000, l1_ratio=0.200000):
 RMSE: 0.7336400911821402
 MAE: 0.5643841279275427
 R2: 0.2373946606358417
Save to: mlruns/1/3ca04bf6e3d44452855f8d1cf0e8124a/artifacts


In [16]:
# Third run with alpha and l1_ratio reduced further.
train(0.1, 0.1)

ElasticNet model (alpha=0.100000, l1_ratio=0.100000):
 RMSE: 0.7128829045893679
 MAE: 0.5462202174984664
 R2: 0.2799376066653345
Save to: mlruns/1/2f9e209a3e4a493896f0ca84499842f4/artifacts


**Note**:

* Running the `mlflow ui` command raised an error.
* Opening the tracking server link directly in the browser seems to work instead.

# 3. Tagging runs

We now have a model that is ready for deployment. Later on we may want to know what happened to it: was it only an experiment, or did it go to production? To keep track of this, we tag the run automatically.

In [5]:
from datetime import datetime
from mlflow.tracking import MlflowClient

# With no arguments the client talks to the tracking URI configured earlier
# through mlflow.set_tracking_uri().
client = MlflowClient()

# MlflowClient.list_experiments() was removed in MLflow 2.0 in favour of
# search_experiments(). Prefer the new method and fall back to the old one
# so the cell runs on either side of that change.
_list_experiments = getattr(client, "search_experiments", None) or client.list_experiments
experiments = _list_experiments()  # list of mlflow.entities.Experiment
print(experiments)

[<Experiment: artifact_location='mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>, <Experiment: artifact_location='mlruns/1', experiment_id='1', lifecycle_stage='active', name='Elasticnet_wine', tags={}>]


In [6]:
# Fetch a single run by its ID. This ID is the one printed for the
# train(0.2, 0.2) call earlier in the notebook.
_run = client.get_run('3ca04bf6e3d44452855f8d1cf0e8124a')
print(_run)

<Run: data=<RunData: metrics={'mae': 0.5643841279275427,
 'r2': 0.2373946606358417,
 'rmse': 0.7336400911821402}, params={'alpha': '0.2', 'l1_ratio': '0.2'}, tags={'mlflow.log-model.history': '[{"run_id": "3ca04bf6e3d44452855f8d1cf0e8124a", '
                             '"artifact_path": "model", "utc_time_created": '
                             '"2022-08-04 20:20:38.519622", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"loader_module": "mlflow.sklearn", '
                             '"python_version": "3.10.4", "env": '
                             '"conda.yaml"}, "sklearn": {"pickled_model": '
                             '"model.pkl", "sklearn_version": "1.0.1", '
                             '"serialization_format": "cloudpickle", "code": '
                             'null}}, "model_uuid": '
                             '"7549740cc5794a2987ce414229d4095c", '
                             '"mlflow_vers

In [7]:
# Tag the run with the key "deployed". The value is the current timestamp from
# datetime.now() (naive local time), formatted as day-month-year (H:M:S.microseconds).
dt = datetime.now().strftime("%d-%m-%Y (%H:%M:%S.%f)")
client.set_tag(_run.info.run_id, "deployed", dt)

In [18]:
# _run.info.run_id