# MLFlow demo

## 1. Setup

### 1.1. Generic imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

### 1.2. ML imports

In [2]:
SEED = 42

import numpy as np
np.random.seed(SEED)

import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

import h2o
from h2o.automl import H2OAutoML

import mlflow
import mlflow.sklearn

from utils.metrics import get_metrics

### 1.3. Global variables

In [3]:
TARGET_VAR = 'class'
TEST_RATIO = 0.25
CV_SCORING = 'neg_mean_absolute_error'

## 2. Preprocessing

### 2.1. Load dataset

In [4]:
# Load IRIS example dataset from Scikit-learn
from sklearn.datasets import load_iris

data = load_iris(return_X_y=False)
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=[TARGET_VAR])

data_df = pd.concat(
    [X, y],
    axis=1,
    ignore_index=False
)
data_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### 2.2. Feature normalization

To be determined

### 2.3. Feature engineering

To be determined

### 2.4. Feature selection

To be determined

### 2.5. Train-test split

In [5]:
# Split the data into training and test sets
train, test = train_test_split(
    data_df,
    test_size=TEST_RATIO,
    random_state=SEED
)

X_train = train.drop([TARGET_VAR], axis=1)
X_test = test.drop([TARGET_VAR], axis=1)

y_train = train[TARGET_VAR].values
y_test = test[TARGET_VAR].values

## 3. Training

### 3.2. Selection of the best model

#### 3.2.1. AutoML (H2O)

In [6]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.4" 2019-07-16; OpenJDK Runtime Environment (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3); OpenJDK 64-Bit Server VM (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3, mixed mode, sharing)
  Starting server from /home/willy/anaconda3/envs/mlops/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpwfelm32k
  JVM stdout: /tmp/tmpwfelm32k/h2o_willy_started_from_python.out
  JVM stderr: /tmp/tmpwfelm32k/h2o_willy_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,24 days
H2O cluster name:,H2O_from_python_willy_w89l9m
H2O cluster total nodes:,1
H2O cluster free memory:,3.891 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [7]:
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
automl_model = H2OAutoML(max_runtime_secs=60, seed=SEED, project_name='mlflow_demo')
automl_model.train(y=TARGET_VAR, training_frame=train_h2o, leaderboard_frame=test_h2o)

AutoML progress: |████████████████████████████████████████████████████████| 100%


#### 3.2.2. Linear model

In [9]:
hyperparams_lasso = {
    "alpha": [0.25, 0.55, 0.75],
    "selection": ["random", "cyclic"],
    "fit_intercept": [False, True],
    "normalize": [False, True],
    "max_iter": [500],
    "positive": [False, True]
}

linear_model = GridSearchCV(
    Lasso(),
    param_grid=hyperparams_lasso,
    cv=10,
    scoring=CV_SCORING,
    n_jobs=-1,
)
linear_model.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0.25, 0.55, 0.75],
                         'fit_intercept': [False, True], 'max_iter': [500],
                         'normalize': [False, True], 'positive': [False, True],
                         'selection': ['random', 'cyclic']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=0)

### 3.3. Model evaluation

In [10]:
# Get predictions for all interesting models
y_train_automl = automl_model.predict(train_h2o).as_data_frame().values
y_test_automl = automl_model.predict(test_h2o).as_data_frame().values
y_test_linear = linear_model.predict(X_test)

# Build dictionary with all predictions
sets_dict = {
    'Train (AutoML)': (y_train, y_train_automl),
    'Test (AutoML)': (y_test, y_test_automl),
    'Test (Linear)': (y_test, y_test_linear),
}

# Build dataframe with all metrics ordered
metrics_df = get_metrics(sets_dict)
metrics_df.head()

gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


Unnamed: 0,Train (AutoML),Test (AutoML),Test (Linear)
R2,0.9998,0.9961,0.8825
Explained variance,0.9998,0.9961,0.8844
Max error,0.0428,0.2126,0.5816
Mean absolute error,0.0081,0.0264,0.2528
Median absolute error,0.0059,0.0064,0.2359


### 3.4. MLFlow logging

In [11]:
automl_model_params = automl_model.leader.get_params()
automl_model_metrics = metrics_df['Test (AutoML)'].reset_index().rename(columns={'index': 'Metric name'})
automl_model_name = automl_model.leader.__name__

In [12]:
with mlflow.start_run():
    mlflow_run_id = mlflow.active_run().info.run_uuid
    
    # Log params
    for param_key, param_value in automl_model_params.items():
        mlflow.log_param(param_key, param_value)

    # Log metrics
    for _, metric_row in automl_model_metrics.iterrows():
        mlflow.log_metric(metric_row['Metric name'], metric_row['Test (AutoML)'])

    # Log models
    mlflow.sklearn.log_model(automl_model, automl_model_name)
    print(f'Model saved in run {mlflow_run_id}')

  from collections import (
  class ResultIterable(collections.Iterable):


Model saved in run 9a888844cfd44f07a28a6f6dfe45e1e1


### 3.5. Serve best model

In [13]:
# Support for other libraries - https://www.mlflow.org/docs/latest/models.html

In [14]:
# Run in terminal
f'mlflow models serve --model-uri runs:/{mlflow_run_id}/{automl_model_name}'

'mlflow models serve --model-uri runs:/9a888844cfd44f07a28a6f6dfe45e1e1/H2OGradientBoostingEstimator'

In [15]:
# Run in another terminal
test_data = y_test[:10]
formatted_test_data = f'"columns":[0],"index":{list(range(test_data.shape[0]))},"data":{test_data}'
formatted_test_data = '{' + formatted_test_data + '}'

f"""curl -d '{formatted_test_data}' -H 'Content-Type: application/json'  127.0.0.1:5000/invocations"""

'curl -d \'{"columns":[0],"index":[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],"data":[1 0 2 1 1 0 1 2 1 1]}\' -H \'Content-Type: application/json\'  127.0.0.1:5000/invocations'